summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/aio.c6
-rw-r--r--fs/autofs4/dev-ioctl.c8
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/autofs4/root.c6
-rw-r--r--fs/bad_inode.c147
-rw-r--r--fs/binfmt_elf.c5
-rw-r--r--fs/btrfs/backref.c28
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c63
-rw-r--r--fs/btrfs/ctree.h44
-rw-r--r--fs/btrfs/delayed-inode.c38
-rw-r--r--fs/btrfs/dev-replace.c25
-rw-r--r--fs/btrfs/disk-io.c102
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c295
-rw-r--r--fs/btrfs/extent_io.c93
-rw-r--r--fs/btrfs/extent_io.h65
-rw-r--r--fs/btrfs/file.c87
-rw-r--r--fs/btrfs/free-space-cache.c13
-rw-r--r--fs/btrfs/inode-item.c9
-rw-r--r--fs/btrfs/inode.c211
-rw-r--r--fs/btrfs/ioctl.c4
-rw-r--r--fs/btrfs/ordered-data.c7
-rw-r--r--fs/btrfs/qgroup.c5
-rw-r--r--fs/btrfs/raid56.c103
-rw-r--r--fs/btrfs/raid56.h11
-rw-r--r--fs/btrfs/reada.c19
-rw-r--r--fs/btrfs/relocation.c12
-rw-r--r--fs/btrfs/scrub.c309
-rw-r--r--fs/btrfs/send.c180
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/sysfs.c10
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c2
-rw-r--r--fs/btrfs/tests/extent-io-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c201
-rw-r--r--fs/btrfs/tests/qgroup-tests.c23
-rw-r--r--fs/btrfs/transaction.c41
-rw-r--r--fs/btrfs/transaction.h7
-rw-r--r--fs/btrfs/tree-log.c236
-rw-r--r--fs/btrfs/volumes.c249
-rw-r--r--fs/btrfs/volumes.h18
-rw-r--r--fs/btrfs/xattr.c8
-rw-r--r--fs/cachefiles/daemon.c4
-rw-r--r--fs/cachefiles/interface.c4
-rw-r--r--fs/cachefiles/namei.c16
-rw-r--r--fs/cachefiles/rdwr.c2
-rw-r--r--fs/ceph/acl.c14
-rw-r--r--fs/ceph/addr.c19
-rw-r--r--fs/ceph/caps.c127
-rw-r--r--fs/ceph/dir.c35
-rw-r--r--fs/ceph/file.c39
-rw-r--r--fs/ceph/inode.c41
-rw-r--r--fs/ceph/locks.c9
-rw-r--r--fs/ceph/mds_client.c127
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/snap.c54
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h5
-rw-r--r--fs/cifs/file.c14
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/configfs/configfs_internal.h3
-rw-r--r--fs/configfs/dir.c72
-rw-r--r--fs/configfs/file.c28
-rw-r--r--fs/configfs/inode.c12
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/dcache.c39
-rw-r--r--fs/debugfs/inode.c36
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h4
-rw-r--r--fs/ecryptfs/file.c36
-rw-r--r--fs/ecryptfs/inode.c4
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/ext4/ext4.h18
-rw-r--r--fs/ext4/indirect.c105
-rw-r--r--fs/ext4/inode.c7
-rw-r--r--fs/ext4/super.c31
-rw-r--r--fs/fs-writeback.c6
-rw-r--r--fs/fuse/dev.c19
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hppfs/hppfs.c4
-rw-r--r--fs/internal.h2
-rw-r--r--fs/jbd2/recovery.c3
-rw-r--r--fs/jffs2/compr_rubin.c5
-rw-r--r--fs/jffs2/dir.c14
-rw-r--r--fs/jffs2/scan.c5
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/kernfs/file.c1
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/locks.c64
-rw-r--r--fs/namei.c170
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c8
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c49
-rw-r--r--fs/nfs/dir.c22
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c11
-rw-r--r--fs/nfs/filelayout/filelayout.c53
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c43
-rw-r--r--fs/nfs/inode.c112
-rw-r--r--fs/nfs/internal.h14
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs3xdr.c5
-rw-r--r--fs/nfs/nfs4client.c9
-rw-r--r--fs/nfs/nfs4proc.c106
-rw-r--r--fs/nfs/nfs4session.c2
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c18
-rw-r--r--fs/nfs/nfs4xdr.c32
-rw-r--r--fs/nfs/pnfs.h4
-rw-r--r--fs/nfs/pnfs_nfs.c30
-rw-r--r--fs/nfs/proc.c6
-rw-r--r--fs/nfs/write.c46
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nfsd/nfsfh.c8
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/nilfs2/btree.c47
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/notify/fanotify/fanotify.c9
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/file.c783
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/ocfs2_fs.h15
-rw-r--r--fs/open.c5
-rw-r--r--fs/overlayfs/copy_up.c5
-rw-r--r--fs/overlayfs/dir.c34
-rw-r--r--fs/overlayfs/inode.c12
-rw-r--r--fs/overlayfs/overlayfs.h18
-rw-r--r--fs/overlayfs/readdir.c181
-rw-r--r--fs/overlayfs/super.c579
-rw-r--r--fs/posix_acl.c18
-rw-r--r--fs/proc/generic.c12
-rw-r--r--fs/proc/inode.c21
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/stat.c2
-rw-r--r--fs/super.c40
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/xfs_export.c6
-rw-r--r--fs/xfs/xfs_file.c28
-rw-r--r--fs/xfs/xfs_fsops.c6
-rw-r--r--fs/xfs/xfs_inode.c4
-rw-r--r--fs/xfs/xfs_inode.h9
-rw-r--r--fs/xfs/xfs_ioctl.c11
-rw-r--r--fs/xfs/xfs_iops.c49
-rw-r--r--fs/xfs/xfs_iops.h1
-rw-r--r--fs/xfs/xfs_mount.h11
-rw-r--r--fs/xfs/xfs_pnfs.c324
-rw-r--r--fs/xfs/xfs_pnfs.h18
-rw-r--r--fs/xfs/xfs_qm.c5
159 files changed, 4118 insertions, 2715 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9ee5343..3662f1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1127,7 +1127,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
}
/* Write all dirty data */
- if (S_ISREG(dentry->d_inode->i_mode))
+ if (d_is_reg(dentry))
filemap_write_and_wait(dentry->d_inode->i_mapping);
retval = p9_client_wstat(fid, &wstat);
diff --git a/fs/aio.c b/fs/aio.c
index 9582865..435ca29 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1296,7 +1296,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
ret = -EINVAL;
if (unlikely(ctx || nr_events == 0)) {
- pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
+ pr_debug("EINVAL: ctx %lu nr_events %u\n",
ctx, nr_events);
goto out;
}
@@ -1344,7 +1344,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
return ret;
}
- pr_debug("EINVAL: io_destroy: invalid context id\n");
+ pr_debug("EINVAL: invalid context id\n");
return -EINVAL;
}
@@ -1528,7 +1528,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
(iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
((ssize_t)iocb->aio_nbytes < 0)
)) {
- pr_debug("EINVAL: io_submit: overflow check\n");
+ pr_debug("EINVAL: overflow check\n");
return -EINVAL;
}
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index aaf96cb..ac7d921 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
*/
static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
- struct autofs_dev_ioctl tmp;
+ struct autofs_dev_ioctl tmp, *res;
if (copy_from_user(&tmp, in, sizeof(tmp)))
return ERR_PTR(-EFAULT);
@@ -106,7 +106,11 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size > (PATH_MAX + sizeof(tmp)))
return ERR_PTR(-ENAMETOOLONG);
- return memdup_user(in, tmp.size);
+ res = memdup_user(in, tmp.size);
+ if (!IS_ERR(res))
+ res->size = tmp.size;
+
+ return res;
}
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index bfdbaba..11dd118 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ if (dentry->d_inode && d_is_symlink(dentry)) {
DPRINTK("checking symlink %p %pd", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index dbb5b72..7e44fdd 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %pD", file, dentry, dentry);
+ DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
@@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* having d_mountpoint() true, so there's no need to call back
* to the daemon.
*/
- if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ if (dentry->d_inode && d_is_symlink(dentry)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
@@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* an incorrect ELOOP error return.
*/
if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
- (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+ (dentry->d_inode && d_is_symlink(dentry)))
status = -EISDIR;
}
spin_unlock(&sbi->fs_lock);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index afd2b44..861b1e1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -15,161 +15,14 @@
#include <linux/namei.h>
#include <linux/poll.h>
-
-static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_read(struct file *filp, char __user *buf,
- size_t size, loff_t *ppos)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_write(struct file *filp, const char __user *buf,
- size_t siz, loff_t *ppos)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- return -EIO;
-}
-
-static int bad_file_readdir(struct file *file, struct dir_context *ctx)
-{
- return -EIO;
-}
-
-static unsigned int bad_file_poll(struct file *filp, poll_table *wait)
-{
- return POLLERR;
-}
-
-static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd,
- unsigned long arg)
-{
- return -EIO;
-}
-
-static long bad_file_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- return -EIO;
-}
-
-static int bad_file_mmap(struct file *file, struct vm_area_struct *vma)
-{
- return -EIO;
-}
-
static int bad_file_open(struct inode *inode, struct file *filp)
{
return -EIO;
}
-static int bad_file_flush(struct file *file, fl_owner_t id)
-{
- return -EIO;
-}
-
-static int bad_file_release(struct inode *inode, struct file *filp)
-{
- return -EIO;
-}
-
-static int bad_file_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- return -EIO;
-}
-
-static int bad_file_aio_fsync(struct kiocb *iocb, int datasync)
-{
- return -EIO;
-}
-
-static int bad_file_fasync(int fd, struct file *filp, int on)
-{
- return -EIO;
-}
-
-static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_sendpage(struct file *file, struct page *page,
- int off, size_t len, loff_t *pos, int more)
-{
- return -EIO;
-}
-
-static unsigned long bad_file_get_unmapped_area(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- return -EIO;
-}
-
-static int bad_file_check_flags(int flags)
-{
- return -EIO;
-}
-
-static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out, loff_t *ppos, size_t len,
- unsigned int flags)
-{
- return -EIO;
-}
-
-static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
-{
- return -EIO;
-}
-
static const struct file_operations bad_file_ops =
{
- .llseek = bad_file_llseek,
- .read = bad_file_read,
- .write = bad_file_write,
- .aio_read = bad_file_aio_read,
- .aio_write = bad_file_aio_write,
- .iterate = bad_file_readdir,
- .poll = bad_file_poll,
- .unlocked_ioctl = bad_file_unlocked_ioctl,
- .compat_ioctl = bad_file_compat_ioctl,
- .mmap = bad_file_mmap,
.open = bad_file_open,
- .flush = bad_file_flush,
- .release = bad_file_release,
- .fsync = bad_file_fsync,
- .aio_fsync = bad_file_aio_fsync,
- .fasync = bad_file_fasync,
- .lock = bad_file_lock,
- .sendpage = bad_file_sendpage,
- .get_unmapped_area = bad_file_get_unmapped_area,
- .check_flags = bad_file_check_flags,
- .flock = bad_file_flock,
- .splice_write = bad_file_splice_write,
- .splice_read = bad_file_splice_read,
};
static int bad_inode_create (struct inode *dir, struct dentry *dentry,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 02b1691..995986b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -645,11 +645,12 @@ out:
static unsigned long randomize_stack_top(unsigned long stack_top)
{
- unsigned int random_variable = 0;
+ unsigned long random_variable = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = get_random_int() & STACK_RND_MASK;
+ random_variable = (unsigned long) get_random_int();
+ random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
#ifdef CONFIG_STACK_GROWSUP
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8729cf6..f55721f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * this makes the path point to (inum INODE_ITEM ioff)
- */
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path)
-{
- struct btrfs_key key;
- return btrfs_find_item(fs_root, path, inum, ioff,
- BTRFS_INODE_ITEM_KEY, &key);
-}
-
-static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- struct btrfs_key *found_key)
-{
- return btrfs_find_item(fs_root, path, inum, ioff,
- BTRFS_INODE_REF_KEY, found_key);
-}
-
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
- ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+ ret = btrfs_find_item(fs_root, path, parent, 0,
+ BTRFS_INODE_REF_KEY, &found_key);
if (ret > 0)
ret = -ENOENT;
if (ret)
@@ -1727,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_key found_key;
while (!ret) {
- ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
- &found_key);
+ ret = btrfs_find_item(fs_root, path, inum,
+ parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+ &found_key);
+
if (ret < 0)
break;
if (ret) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2a1ac6b..9c41fba 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -32,9 +32,6 @@ struct inode_fs_paths {
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path);
-
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4aadadc..de5e4f2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -185,6 +185,9 @@ struct btrfs_inode {
struct btrfs_delayed_node *delayed_node;
+ /* File creation time. */
+ struct timespec i_otime;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 14a72ed..6d67f32 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
+ if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+ !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+ return;
+
spin_lock(&root->fs_info->trans_lock);
- if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
- list_empty(&root->dirty_list)) {
- list_add(&root->dirty_list,
- &root->fs_info->dirty_cowonly_roots);
+ if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+ /* Want the extent tree to be the last on the list */
+ if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ list_move_tail(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ else
+ list_move(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
}
spin_unlock(&root->fs_info->trans_lock);
}
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(eb->start,
- fs_info->tree_root->nodesize);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
} else if (old_root) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ eb = alloc_dummy_extent_buffer(root->fs_info, logical);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
eb = btrfs_clone_extent_buffer(eb_root);
@@ -1638,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
parent_nritems = btrfs_header_nritems(parent);
blocksize = root->nodesize;
- end_slot = parent_nritems;
+ end_slot = parent_nritems - 1;
- if (parent_nritems == 1)
+ if (parent_nritems <= 1)
return 0;
btrfs_set_lock_blocking(parent);
- for (i = start_slot; i < end_slot; i++) {
+ for (i = start_slot; i <= end_slot; i++) {
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1662,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
other = btrfs_node_blockptr(parent, i - 1);
close = close_blocks(blocknr, other, blocksize);
}
- if (!close && i < end_slot - 2) {
+ if (!close && i < end_slot) {
other = btrfs_node_blockptr(parent, i + 1);
close = close_blocks(blocknr, other, blocksize);
}
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- readahead_tree_block(root, search, blocksize);
+ readahead_tree_block(root, search);
nread += blocksize;
}
nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int blocksize;
parent = path->nodes[level + 1];
if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
}
if (block1)
- readahead_tree_block(root, block1, blocksize);
+ readahead_tree_block(root, block1);
if (block2)
- readahead_tree_block(root, block2, blocksize);
+ readahead_tree_block(root, block2);
}
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
return 0;
}
-int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
{
int ret;
struct btrfs_key key;
struct extent_buffer *eb;
- struct btrfs_path *path;
+
+ ASSERT(path);
+ ASSERT(found_key);
key.type = key_type;
key.objectid = iobjectid;
key.offset = ioff;
- if (found_path == NULL) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- } else
- path = found_path;
-
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
- if ((ret < 0) || (found_key == NULL)) {
- if (path != found_path)
- btrfs_free_path(path);
+ if (ret < 0)
return ret;
- }
eb = path->nodes[0];
if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
extent_buffer_get(c);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
return 0;
}
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
path->search_for_split = 1;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
path->search_for_split = 0;
+ if (ret > 0)
+ ret = -EAGAIN;
if (ret < 0)
goto err;
ret = -EAGAIN;
leaf = path->nodes[0];
- /* if our item isn't there or got smaller, return now */
- if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ /* if our item isn't there, return now */
+ if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0b18070..f9c89ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6)
+
/*
* We need a bit for restriper to be able to tell when chunks of type
* SINGLE are available. This "extended" profile format is used in
@@ -1239,7 +1244,6 @@ enum btrfs_disk_cache_state {
BTRFS_DC_ERROR = 1,
BTRFS_DC_CLEAR = 2,
BTRFS_DC_SETUP = 3,
- BTRFS_DC_NEED_WRITE = 4,
};
struct btrfs_caching_control {
@@ -1277,7 +1281,6 @@ struct btrfs_block_group_cache {
unsigned long full_stripe_len;
unsigned int ro:1;
- unsigned int dirty:1;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@@ -1315,6 +1318,9 @@ struct btrfs_block_group_cache {
struct list_head ro_list;
atomic_t trimming;
+
+ /* For dirty block groups */
+ struct list_head dirty_list;
};
/* delayed seq elem */
@@ -1741,6 +1747,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -1776,6 +1783,7 @@ struct btrfs_subvolume_writers {
#define BTRFS_ROOT_DEFRAG_RUNNING 6
#define BTRFS_ROOT_FORCE_COW 7
#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+#define BTRFS_ROOT_DIRTY 9
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1794,8 +1802,6 @@ struct btrfs_root {
struct btrfs_fs_info *fs_info;
struct extent_io_tree dirty_log_pages;
- struct kobject root_kobj;
- struct completion kobj_unregister;
struct mutex objectid_mutex;
spinlock_t accounting_lock;
@@ -2465,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-static inline struct btrfs_timespec *
-btrfs_inode_atime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, atime);
- return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, mtime);
- return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, ctime);
- return (struct btrfs_timespec *)ptr;
-}
-
BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
@@ -3406,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
@@ -3928,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
loff_t actual_len, u64 *alloc_hint);
int btrfs_inode_check_errors(struct inode *inode);
extern const struct dentry_operations btrfs_dentry_operations;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
+#endif
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index de4e70f..82f0c7c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
- btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->atime,
inode->i_atime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->atime,
inode->i_atime.tv_nsec);
- btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->mtime,
inode->i_mtime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->mtime,
inode->i_mtime.tv_nsec);
- btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->ctime,
inode->i_ctime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->ctime,
inode->i_ctime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
- struct btrfs_timespec *tspec;
delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
- tspec = btrfs_inode_atime(inode_item);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
- tspec = btrfs_inode_mtime(inode_item);
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
- tspec = btrfs_inode_ctime(inode_item);
- inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ca6a3a3..5ec03d9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -440,18 +440,9 @@ leave:
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
- s64 writers;
- DEFINE_WAIT(wait);
-
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- do {
- prepare_to_wait(&fs_info->replace_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- writers = percpu_counter_sum(&fs_info->bio_counter);
- if (writers)
- schedule();
- finish_wait(&fs_info->replace_wait, &wait);
- } while (writers);
+ wait_event(fs_info->replace_wait, !percpu_counter_sum(
+ &fs_info->bio_counter));
}
/*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
- DEFINE_WAIT(wait);
-again:
- percpu_counter_inc(&fs_info->bio_counter);
- if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ while (1) {
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state)))
+ break;
+
btrfs_bio_counter_dec(fs_info);
wait_event(fs_info->replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
- goto again;
}
-
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1afb182..639f266 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_INFO
+ printk_ratelimited(KERN_WARNING
"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
"level %d\n",
root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ printk_ratelimited(KERN_ERR
+ "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
eb->fs_info->sb->s_id, eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_info(root->fs_info, "bad tree block level %d",
+ btrfs_err(root->fs_info, "bad tree block level %d",
(int)btrfs_header_level(eb));
ret = -EIO;
goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
free_extent_buffer(buf);
}
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int mirror_num, struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return 0;
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
if (btrfs_test_is_dummy_root(root))
- return alloc_test_extent_buffer(root->fs_info, bytenr,
- blocksize);
- return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
+ return alloc_test_extent_buffer(root->fs_info, bytenr);
+ return alloc_extent_buffer(root->fs_info, bytenr);
}
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return NULL;
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- memset(&root->root_kobj, 0, sizeof(root->root_kobj));
if (fs_info)
root->defrag_trans_start = fs_info->generation;
else
root->defrag_trans_start = 0;
- init_completion(&root->kobj_unregister);
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
bool check_ref)
{
struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
int ret;
if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
if (ret)
goto fail;
- ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
- location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ btrfs_free_path(path);
if (ret < 0)
goto fail;
if (ret == 0)
@@ -2232,6 +2241,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
@@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_ERR "BTRFS: has skinny extents\n");
+ printk(KERN_INFO "BTRFS: has skinny extents\n");
/*
* flag our filesystem as having big metadata blocks if
@@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb,
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != nodesize)) {
- printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
+ printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
goto fail_alloc;
@@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(sectorsize);
if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
+ printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
goto fail_sb_buffer;
}
if (sectorsize != PAGE_SIZE) {
- printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
+ printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
"found on %s\n", (unsigned long)sectorsize, sb->s_id);
goto fail_sb_buffer;
}
@@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to read the system "
+ printk(KERN_ERR "BTRFS: failed to read the system "
"array on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb,
generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
- printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_chunk_tree(chunk_root);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
- printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2765,7 +2775,7 @@ retry_root_backup:
ret = btrfs_recover_balance(fs_info);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to recover balance\n");
+ printk(KERN_ERR "BTRFS: failed to recover balance\n");
goto fail_block_groups;
}
@@ -3860,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
btrfs_super_log_root(sb));
+ /*
+ * Check the lower bound, the alignment and other constraints are
+ * checked later.
+ */
+ if (btrfs_super_nodesize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+ btrfs_super_nodesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sectorsize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+ btrfs_super_sectorsize(sb));
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
fs_info->fsid, sb->dev_item.fsid);
@@ -3873,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
if (btrfs_super_num_devices(sb) > (1UL << 31))
printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ printk(KERN_ERR "BTRFS: number of devices is 0\n");
+ ret = -EINVAL;
+ }
if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3881,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
}
/*
+ * Obvious sys_chunk_array corruptions, it must hold at least one key
+ * and one chunk
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
* The generation is a global counter, we'll trust it more than the others
* but it's still possible that it's the one that's wrong.
*/
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4146518..27d44c0 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a684086..8b353ad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*/
ret = 0;
}
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- int run_most = 0;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
root = root->fs_info->tree_root;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0) {
+ if (count == 0)
count = atomic_read(&delayed_refs->num_entries) * 2;
- run_most = 1;
- }
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -3210,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
return 0;
}
+ if (trans->aborted)
+ return 0;
again:
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3245,6 +3245,20 @@ again:
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ /*
+ * So theoretically we could recover from this, simply set the
+ * super cache generation to 0 so we know to invalidate the
+ * cache, but then we'd have to keep track of the block groups
+ * that fail this way so we know we _have_ to reset this cache
+ * before the next commit or risk reading stale cache. So to
+ * limit our exposure to horrible edge cases lets just abort the
+ * transaction, this only happens in really bad situations
+ * anyway.
+ */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
WARN_ON(ret);
if (i_size_read(inode) > 0) {
@@ -3311,124 +3325,72 @@ out:
return ret;
}
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
- struct btrfs_block_group_cache *cache;
- int err = 0;
+ struct btrfs_block_group_cache *cache, *tmp;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_path *path;
- u64 last = 0;
+
+ if (list_empty(&cur_trans->dirty_bgs) ||
+ !btrfs_test_opt(root, SPACE_CACHE))
+ return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-again:
- while (1) {
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- err = cache_save_setup(cache, trans, path);
- last = cache->key.objectid + cache->key.offset;
- btrfs_put_block_group(cache);
+ /* Could add new block groups, use _safe just in case */
+ list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+ dirty_list) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
}
- while (1) {
- if (last == 0) {
- err = btrfs_run_delayed_refs(trans, root,
- (unsigned long)-1);
- if (err) /* File system offline */
- goto out;
- }
-
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
- btrfs_put_block_group(cache);
- goto again;
- }
-
- if (cache->dirty)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
-
- if (cache->disk_cache_state == BTRFS_DC_SETUP)
- cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
- cache->dirty = 0;
- last = cache->key.objectid + cache->key.offset;
-
- err = write_one_cache_group(trans, root, path, cache);
- btrfs_put_block_group(cache);
- if (err) /* File system offline */
- goto out;
- }
+ btrfs_free_path(path);
+ return 0;
+}
- while (1) {
- /*
- * I don't think this is needed since we're just marking our
- * preallocated extent as written, but just in case it can't
- * hurt.
- */
- if (last == 0) {
- err = btrfs_run_delayed_refs(trans, root,
- (unsigned long)-1);
- if (err) /* File system offline */
- goto out;
- }
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ struct btrfs_path *path;
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- /*
- * Really this shouldn't happen, but it could if we
- * couldn't write the entire preallocated extent and
- * splitting the extent resulted in a new block.
- */
- if (cache->dirty) {
- btrfs_put_block_group(cache);
- goto again;
- }
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
+ if (list_empty(&cur_trans->dirty_bgs))
+ return 0;
- err = btrfs_write_out_cache(root, trans, cache, path);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
- /*
- * If we didn't have an error then the cache state is still
- * NEED_WRITE, so we can set it to WRITTEN.
- */
- if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
- cache->disk_cache_state = BTRFS_DC_WRITTEN;
- last = cache->key.objectid + cache->key.offset;
+ /*
+ * We don't need the lock here since we are protected by the transaction
+ * commit. We want to do the cache_save_setup first and then run the
+ * delayed refs to make sure we have the best chance at doing this all
+ * in one shot.
+ */
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group_cache,
+ dirty_list);
+ list_del_init(&cache->dirty_list);
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, root,
+ (unsigned long) -1);
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+ btrfs_write_out_cache(root, trans, cache, path);
+ if (!ret)
+ ret = write_one_cache_group(trans, root, path, cache);
btrfs_put_block_group(cache);
}
-out:
btrfs_free_path(path);
- return err;
+ return ret;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5043,19 +5005,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
/**
* drop_outstanding_extent - drop an outstanding extent
* @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're relaseing.
*
* This is called when we are freeing up an outstanding extent, either called
* after an error or after an extent is written. This will return the number of
* reserved extents that need to be freed. This must be called with
* BTRFS_I(inode)->lock held.
*/
-static unsigned drop_outstanding_extent(struct inode *inode)
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
{
unsigned drop_inode_space = 0;
unsigned dropped_extents = 0;
+ unsigned num_extents = 0;
- BUG_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
+ num_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ ASSERT(num_extents);
+ ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
if (BTRFS_I(inode)->outstanding_extents == 0 &&
test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5168,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ nr_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ BTRFS_I(inode)->outstanding_extents += nr_extents;
+ nr_extents = 0;
if (BTRFS_I(inode)->outstanding_extents >
BTRFS_I(inode)->reserved_extents)
@@ -5226,7 +5198,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
out_fail:
spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
+ dropped = drop_outstanding_extent(inode, num_bytes);
/*
* If the inodes csum_bytes is the same as the original
* csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5277,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
+ dropped = drop_outstanding_extent(inode, num_bytes);
if (num_bytes)
to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5313,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
@@ -5375,8 +5350,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5390,14 @@ static int update_block_group(struct btrfs_root *root,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -5424,7 +5408,6 @@ static int update_block_group(struct btrfs_root *root,
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
- cache->dirty = 1;
old_val = btrfs_block_group_used(&cache->item);
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
@@ -5807,10 +5790,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
unpin = &fs_info->freed_extents[0];
while (1) {
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
- if (ret)
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
if (btrfs_test_opt(root, DISCARD))
ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5804,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end, true);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
@@ -6103,7 +6090,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = update_block_group(root, bytenr, num_bytes, 0);
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -6205,7 +6192,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_block_group_cache *cache = NULL;
int pin = 1;
int ret;
@@ -6221,17 +6207,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (!last_ref)
return;
- cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-
if (btrfs_header_generation(buf) == trans->transid) {
+ struct btrfs_block_group_cache *cache;
+
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, root, buf->start);
if (!ret)
goto out;
}
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(root, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
goto out;
}
@@ -6239,6 +6228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6253,7 +6243,6 @@ out:
* anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- btrfs_put_block_group(cache);
}
/* Can return -ENOMEM */
@@ -7063,7 +7052,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -7152,7 +7141,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+ ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+ 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -7217,11 +7207,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, u32 blocksize, int level)
+ u64 bytenr, int level)
{
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return ERR_PTR(-ENOMEM);
btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7330,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- blocksize, level);
+ level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -7357,8 +7347,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- buf = btrfs_init_new_buffer(trans, root, ins.objectid,
- blocksize, level);
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7476,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(root, bytenr, blocksize);
+ readahead_tree_block(root, bytenr);
nread++;
}
wc->reada_slot = slot;
@@ -7828,7 +7817,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = btrfs_find_tree_block(root, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8537,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (IS_ERR(trans))
return PTR_ERR(trans);
- alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, alloc_flags,
- CHUNK_ALLOC_FORCE);
- if (ret < 0)
- goto out;
- }
-
ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
@@ -8566,6 +8547,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
goto out;
ret = set_block_group_ro(cache, 0);
out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ check_system_chunk(trans, root, alloc_flags);
+ }
+
btrfs_end_transaction(trans, root);
return ret;
}
@@ -9005,6 +8991,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->dirty_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
@@ -9068,9 +9055,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
- cache->disk_cache_state = BTRFS_DC_CLEAR;
if (btrfs_test_opt(root, SPACE_CACHE))
- cache->dirty = 1;
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
}
read_extent_buffer(leaf, &cache->item,
@@ -9460,6 +9446,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
}
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
@@ -9611,7 +9604,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- trans = btrfs_join_transaction(root);
+ /* 1 for btrfs_orphan_reserve_metadata() */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_set_block_group_rw(root, block_group);
ret = PTR_ERR(trans);
@@ -9624,18 +9618,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
start = block_group->key.objectid;
end = start + block_group->key.offset - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can lookup for the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group);
goto end_trans;
}
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c73df6a..d688cfe 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits);
+ struct extent_state *state, unsigned *bits);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned long *bits)
+ unsigned *bits)
{
struct rb_node *node;
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned long *bits, int wake)
+ unsigned *bits, int wake)
{
struct extent_state *next;
- unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int wake, int delete,
+ unsigned bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask)
{
@@ -789,9 +789,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned long *bits)
+ unsigned *bits)
{
- unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
+ unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
static void cache_state_if_flags(struct extent_state *state,
struct extent_state **cached_ptr,
- const u64 flags)
+ unsigned flags)
{
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long exclusive_bits,
+ unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
gfp_t mask)
{
@@ -1034,7 +1034,7 @@ search_again:
}
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, u64 * failed_start,
+ unsigned bits, u64 * failed_start,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long clear_bits,
+ unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask)
+ unsigned bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask)
+ unsigned bits, gfp_t mask)
{
return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, struct extent_state **cached_state)
+ unsigned bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
+
while (1) {
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
EXTENT_LOCKED, &failed_start,
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
*/
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned long bits)
+ u64 start, unsigned bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
* If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned long bits,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned long clear_bits,
+ unsigned clear_bits,
unsigned long page_ops)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned long bits, int contig)
+ unsigned bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int filled, struct extent_state *cached)
+ unsigned bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
sector = bbio->stripes[mirror_num-1].physical >> 9;
bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[mirror_num-1].dev;
- kfree(bbio);
+ btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
bio_put(bio);
return -EIO;
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
- if (ret < 0)
+ if (ret < 0) {
+ *bio_ret = NULL;
return ret;
+ }
bio = NULL;
} else {
return 0;
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
page,
&delalloc_start,
&delalloc_end,
- 128 * 1024 * 1024);
+ BTRFS_MAX_EXTENT_SIZE);
if (nr_delalloc == 0) {
delalloc_start = delalloc_end + 1;
continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len, gfp_t mask)
+ unsigned long len)
{
struct extent_buffer *eb = NULL;
- eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
if (eb == NULL)
return NULL;
eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
struct extent_buffer *new;
unsigned long num_pages = num_extent_pages(src->start, src->len);
- new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+ new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
return NULL;
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long len;
+ unsigned long num_pages;
unsigned long i;
- eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+ num_pages = num_extent_pages(0, len);
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+ u64 start)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
- eb = alloc_dummy_extent_buffer(start, len);
+ eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
return NULL;
eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
#endif
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+ u64 start)
{
+ unsigned long len = fs_info->tree_root->nodesize;
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
+ eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
@@ -4951,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb)
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_page(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+ __free_extent_buffer(eb);
+ return 1;
+ }
+#endif
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ece9ce8..695b0cc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -4,22 +4,22 @@
#include <linux/rbtree.h>
/* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_BOUNDARY (1 << 9)
-#define EXTENT_NODATASUM (1 << 10)
-#define EXTENT_DO_ACCOUNTING (1 << 11)
-#define EXTENT_FIRST_DELALLOC (1 << 12)
-#define EXTENT_NEED_WAIT (1 << 13)
-#define EXTENT_DAMAGED (1 << 14)
-#define EXTENT_NORESERVE (1 << 15)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_WRITEBACK (1U << 1)
+#define EXTENT_UPTODATE (1U << 2)
+#define EXTENT_LOCKED (1U << 3)
+#define EXTENT_NEW (1U << 4)
+#define EXTENT_DELALLOC (1U << 5)
+#define EXTENT_DEFRAG (1U << 6)
+#define EXTENT_BOUNDARY (1U << 9)
+#define EXTENT_NODATASUM (1U << 10)
+#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_FIRST_DELALLOC (1U << 12)
+#define EXTENT_NEED_WAIT (1U << 13)
+#define EXTENT_DAMAGED (1U << 14)
+#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/*
* flags for bio submission. The high bits indicate the compression
@@ -81,9 +81,9 @@ struct extent_io_ops {
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long *bits);
+ unsigned *bits);
void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long *bits);
+ unsigned *bits);
void (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
atomic_t refs;
- unsigned long state;
+ unsigned state;
/* for use by the FS */
u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
int try_release_extent_buffer(struct page *page);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, struct extent_state **cached);
+ unsigned bits, struct extent_state **cached);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned long bits, int contig);
+ u64 max_bytes, unsigned bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int filled,
+ unsigned bits, int filled,
struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask);
+ unsigned bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int wake, int delete,
+ unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask);
+ unsigned bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, u64 *failed_start,
+ unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long clear_bits,
+ unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned long bits,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+ u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned long bits_to_clear,
+ unsigned bits_to_clear,
unsigned long page_ops);
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 *end, u64 max_bytes);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
+ u64 start);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69c9508..aee18f8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
mutex_unlock(&inode->i_mutex);
/*
- * we want to make sure fsync finds this change
- * but we haven't joined a transaction running right now.
- *
- * Later on, someone is sure to update the inode and get the
- * real transid recorded.
- *
- * We set last_trans now to the fs_info generation + 1,
- * this will either be one more than the running transaction
- * or the generation used for the next transaction if there isn't
- * one running right now.
- *
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occured.
*/
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
@@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * check the transaction that last modified this inode
- * and see if its already been committed
- */
- if (!BTRFS_I(inode)->last_trans) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
-
- /*
- * if the last transaction that changed this file was before
- * the current transaction, we can bail out now without any
- * syncing
+ * If the last transaction that changed this file was before the current
+ * transaction and we have the full sync flag set in our inode, we can
+ * bail out now without any syncing.
+ *
+ * Note that we can't bail out if the full sync flag isn't set. This is
+ * because when the full sync flag is set we start all ordered extents
+ * and wait for them to fully complete - when they complete they update
+ * the inode's last_trans field through:
+ *
+ * btrfs_finish_ordered_io() ->
+ * btrfs_update_inode_fallback() ->
+ * btrfs_update_inode() ->
+ * btrfs_set_inode_last_trans()
+ *
+ * So we are sure that last_trans is up to date and can do this check to
+ * bail out safely. For the fast path, when the full sync flag is not
+ * set in our inode, we can not do it because we start only our ordered
+ * extents and don't wait for them to complete (that is when
+ * btrfs_finish_ordered_io runs), so here at this point their last_trans
+ * value might be less than or equals to fs_info->last_trans_committed,
+ * and setting a speculative last_trans for an inode when a buffered
+ * write is made (such as fs_info->generation + 1 for example) would not
+ * be reliable since after setting the value and before fsync is called
+ * any number of transactions can start and commit (transaction kthread
+ * commits the current transaction periodically), and a transaction
+ * commit does not start nor waits for ordered extents to complete.
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed) {
- BTRFS_I(inode)->last_trans = 0;
-
+ (full_sync && BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed)) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
+ bool truncated_page = false;
+ bool updated_inode = false;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < ino_size)
+ if (offset < ino_size) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode, offset, len, 0);
+ } else {
+ ret = 0;
+ }
goto out_only_mutex;
}
/* zero back part of the first page */
if (offset < ino_size) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode,
tail_start + tail_len, 0, 1);
if (ret)
@@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
if (lockend < lockstart) {
- mutex_unlock(&inode->i_mutex);
- return 0;
+ ret = 0;
+ goto out_only_mutex;
}
while (1) {
@@ -2506,6 +2514,7 @@ out_trans:
trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
+ updated_inode = true;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
out_free:
@@ -2515,6 +2524,22 @@ out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
out_only_mutex:
+ if (!updated_inode && truncated_page && !ret && !err) {
+ /*
+ * If we only end up zeroing part of a page, we still need to
+ * update the inode item, so that all the time fields are
+ * updated as well as the necessary btrfs inode in memory fields
+ * for detecting, at fsync time, if the inode isn't yet in the
+ * log tree or it's there but not up to date.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ } else {
+ err = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_end_transaction(trans, root);
+ }
+ }
mutex_unlock(&inode->i_mutex);
if (ret && !err)
err = ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7..a719785 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
- struct list_head bitmaps;
+ LIST_HEAD(bitmaps);
u64 num_entries;
u64 num_bitmaps;
u64 generation;
u8 type;
int ret = 0;
- INIT_LIST_HEAD(&bitmaps);
-
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
return 0;
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
+ enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
root = root->fs_info->tree_root;
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
path, block_group->key.objectid);
if (ret) {
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_ERROR;
- spin_unlock(&block_group->lock);
+ dcs = BTRFS_DC_ERROR;
ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
#endif
}
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = dcs;
+ spin_unlock(&block_group->lock);
iput(inode);
return ret;
}
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
min_bytes);
- INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
bytes + empty_size,
cont1_bytes, min_bytes);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa478..265e03c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->leave_spinning = 1;
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
- if (ret == -EOVERFLOW)
- ret = -EMLINK;
+ if (ret == -EOVERFLOW) {
+ if (find_name_in_backref(path, name, name_len, &ref))
+ ret = -EEXIST;
+ else
+ ret = -EMLINK;
+ }
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b214ab1..686331f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
static int btrfs_dirty_inode(struct inode *inode);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
+
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -1530,10 +1537,32 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static void btrfs_split_extent_hook(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ u64 size;
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
+ size = orig->end - orig->start + 1;
+ if (size > BTRFS_MAX_EXTENT_SIZE) {
+ u64 num_extents;
+ u64 new_size;
+
+ /*
+ * See the explanation in btrfs_merge_extent_hook, the same
+ * applies here, just in reverse.
+ */
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ new_size = split - orig->start;
+ num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) >= num_extents)
+ return;
+ }
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1549,10 +1578,55 @@ static void btrfs_merge_extent_hook(struct inode *inode,
struct extent_state *new,
struct extent_state *other)
{
+ u64 new_size, old_size;
+ u64 num_extents;
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
+ if (new->start > other->start)
+ new_size = new->end - other->start + 1;
+ else
+ new_size = other->end - new->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return;
+ }
+
+ /*
+ * We have to add up either side to figure out how many extents were
+ * accounted for before we merged into one big extent. If the number of
+ * extents we accounted for is <= the amount we need for the new range
+ * then we can return, otherwise drop. Think of it like this
+ *
+ * [ 4k][MAX_SIZE]
+ *
+ * So we've grown the extent by a MAX_SIZE extent, this would mean we
+ * need 2 outstanding extents, on one side we have 1 and the other side
+ * we have 1 so they are == and we can return. But in this case
+ *
+ * [MAX_SIZE+4k][MAX_SIZE+4k]
+ *
+ * Each range on their own accounts for 2 extents, but merged together
+ * they are only 3 extents worth of accounting, so we need to drop in
+ * this case.
+ */
+ old_size = other->end - other->start + 1;
+ num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ old_size = new->end - new->start + 1;
+ num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) >= num_extents)
+ return;
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents--;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1604,7 +1678,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* have pending delalloc work to be done.
*/
static void btrfs_set_bit_hook(struct inode *inode,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1627,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
+ /* For sanity tests */
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
root->fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
@@ -1645,9 +1723,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
*/
static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
- unsigned long *bits)
+ unsigned *bits)
{
u64 len = state->end + 1 - state->start;
+ u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+ BTRFS_MAX_EXTENT_SIZE);
spin_lock(&BTRFS_I(inode)->lock);
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1747,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
*bits &= ~EXTENT_FIRST_DELALLOC;
} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents--;
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -1680,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
+ /* For sanity tests. */
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
btrfs_free_reserved_data_space(inode, len);
@@ -2945,7 +3029,7 @@ static int __readpage_endio_check(struct inode *inode,
return 0;
zeroit:
if (__ratelimit(&_rs))
- btrfs_info(BTRFS_I(inode)->root->fs_info,
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
"csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3491,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
- btrfs_crit(root->fs_info,
+ btrfs_err(root->fs_info,
"could not do orphan cleanup %d", ret);
btrfs_free_path(path);
return ret;
@@ -3490,7 +3574,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_timespec *tspec;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
unsigned long ptr;
@@ -3527,17 +3610,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- tspec = btrfs_inode_atime(inode_item);
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
- tspec = btrfs_inode_mtime(inode_item);
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
- tspec = btrfs_inode_ctime(inode_item);
- inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
+
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3656,21 +3741,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
&token);
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5007,6 +5097,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
int err = 0;
@@ -5017,9 +5108,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
}
err = -ENOENT;
- ret = btrfs_find_item(root->fs_info->tree_root, path,
- BTRFS_I(dir)->root->root_key.objectid,
- location->objectid, BTRFS_ROOT_REF_KEY, NULL);
+ key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+ 0, 0);
if (ret) {
if (ret < 0)
err = ret;
@@ -5258,7 +5352,10 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
return inode;
}
@@ -5826,7 +5923,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -7134,17 +7236,28 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
+ u64 *outstanding_extents = NULL;
int unlock_bits = EXTENT_LOCKED;
int ret = 0;
if (create)
- unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+ unlock_bits |= EXTENT_DIRTY;
else
len = min_t(u64, len, root->sectorsize);
lockstart = start;
lockend = start + len - 1;
+ if (current->journal_info) {
+ /*
+ * Need to pull our outstanding extents and set journal_info to NULL so
+ * that anything that needs to check if there's a transction doesn't get
+ * confused.
+ */
+ outstanding_extents = current->journal_info;
+ current->journal_info = NULL;
+ }
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
@@ -7205,7 +7318,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
int type;
- int ret;
u64 block_start, orig_start, orig_block_len, ram_bytes;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -7269,14 +7381,21 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (*outstanding_extents) {
+ (*outstanding_extents)--;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockstart + len - 1, EXTENT_DELALLOC, NULL,
- &cached_state, GFP_NOFS);
- BUG_ON(ret);
+ current->journal_info = outstanding_extents;
+ btrfs_free_reserved_data_space(inode, len);
}
/*
@@ -7299,6 +7418,8 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ if (outstanding_extents)
+ current->journal_info = outstanding_extents;
return ret;
}
@@ -7805,8 +7926,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
}
/* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_get_alloc_profile(root, 1) &
- (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
async_submit = 0;
else
async_submit = 1;
@@ -7999,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ u64 outstanding_extents = 0;
size_t count = 0;
int flags = 0;
bool wakeup = true;
@@ -8036,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
goto out;
+ outstanding_extents = div64_u64(count +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+
+ /*
+ * We need to know how many extents we reserved so that we can
+ * do the accounting properly if we go over the number we
+ * originally calculated. Abuse current->journal_info for this.
+ */
+ current->journal_info = &outstanding_extents;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
inode_dio_done(inode);
@@ -8048,13 +8179,12 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
iter, offset, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (rw & WRITE) {
+ current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED)
btrfs_delalloc_release_space(inode, count);
else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode,
count - (size_t)ret);
- else
- btrfs_delalloc_release_metadata(inode, 0);
}
out:
if (wakeup)
@@ -8575,6 +8705,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
+ ei->i_otime.tv_sec = 0;
+ ei->i_otime.tv_nsec = 0;
+
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(&ei->io_tree, &inode->i_data);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d49fe8a..74609b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -776,11 +776,11 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
return -EPERM;
if (isdir) {
- if (!S_ISDIR(victim->d_inode->i_mode))
+ if (!d_is_dir(victim))
return -ENOTDIR;
if (IS_ROOT(victim))
return -EBUSY;
- } else if (S_ISDIR(victim->d_inode->i_mode))
+ } else if (d_is_dir(victim))
return -EISDIR;
if (IS_DEADDIR(dir))
return -ENOENT;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 534544e..157cc54 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -452,9 +452,7 @@ void btrfs_get_logged_extents(struct inode *inode,
continue;
if (entry_end(ordered) <= start)
break;
- if (!list_empty(&ordered->log_list))
- continue;
- if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
continue;
list_add(&ordered->log_list, logged_list);
atomic_inc(&ordered->refs);
@@ -511,8 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
- list_add_tail(&ordered->trans_list, &trans->ordered);
+ list_add_tail(&ordered->trans_list, &trans->ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 48b60db..058c79e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
if (oper1->seq < oper2->seq)
return -1;
if (oper1->seq > oper2->seq)
- return -1;
+ return 1;
if (oper1->ref_root < oper2->ref_root)
return -1;
if (oper1->ref_root > oper2->ref_root)
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup = u64_to_ptr(unode->aux);
qgroup->rfer += sign * oper->num_bytes;
qgroup->rfer_cmpr += sign * oper->num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
qgroup->excl += sign * oper->num_bytes;
- if (sign < 0)
- WARN_ON(qgroup->excl < oper->num_bytes);
qgroup->excl_cmpr += sign * oper->num_bytes;
qgroup_dirty(fs_info, qgroup);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ab2a17..5264858 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,15 +58,6 @@
*/
#define RBIO_CACHE_READY_BIT 3
-/*
- * bbio and raid_map is managed by the caller, so we shouldn't free
- * them here. And besides that, all rbios with this flag should not
- * be cached, because we need raid_map to check the rbios' stripe
- * is the same or not, but it is very likely that the caller has
- * free raid_map, so don't cache those rbios.
- */
-#define RBIO_HOLD_BBIO_MAP_BIT 4
-
#define RBIO_CACHE_SIZE 1024
enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
struct btrfs_fs_info *fs_info;
struct btrfs_bio *bbio;
- /*
- * logical block numbers for the start of each stripe
- * The last one or two are p/q. These are sorted,
- * so raid_map[0] is the start of our full stripe
- */
- u64 *raid_map;
-
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
* lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->raid_map[0];
+ u64 num = rbio->bbio->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->raid_map[0] !=
- cur->raid_map[0])
+ if (last->bbio->raid_map[0] !=
+ cur->bbio->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
walk++;
- if (cur->raid_map[0] == rbio->raid_map[0]) {
+ if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
spin_lock(&cur->bio_list_lock);
/* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static inline void
-__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
-{
- if (need) {
- kfree(raid_map);
- kfree(bbio);
- }
-}
-
-static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
-{
- __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
- !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
-}
-
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- free_bbio_and_raid_map(rbio);
-
+ btrfs_put_bbio(rbio->bbio);
kfree(rbio);
}
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len)
+ struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bbio = bbio;
- rbio->raid_map = raid_map;
rbio->fs_info = root->fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->bio_pages = p + sizeof(struct page *) * num_pages;
rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
- if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ nr_data = real_stripes - 1;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
- nr_data = real_stripes - 1;
+ BUG();
rbio->nr_data = nr_data;
return rbio;
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
spin_lock_irq(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list) {
start = (u64)bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->raid_map[0];
+ stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_CACHE_SHIFT;
for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
logical <<= 9;
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->raid_map[i];
+ stripe_start = rbio->bbio->raid_map[i];
if (logical >= stripe_start &&
logical < stripe_start + rbio->stripe_len) {
return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
* our main entry point for writes from the rest of the FS.
*/
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len)
+ struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio)) {
- __free_bbio_and_raid_map(bbio, raid_map, 1);
+ btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->raid_map[rbio->real_stripes - 1] ==
- RAID6_Q_STRIPE) {
-
+ if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+ if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bbio->raid_map[faila] ==
+ RAID5_P_STRIPE) {
err = -EIO;
goto cleanup;
}
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2001,8 +1967,7 @@ cleanup:
cleanup_io:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- if (err == 0 &&
- !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
+ if (err == 0)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
* of the drive.
*/
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num, int generic_io)
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io)
{
struct btrfs_raid_bio *rbio;
int ret;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio)) {
- __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+ if (generic_io)
+ btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
BUG();
- __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+ if (generic_io)
+ btrfs_put_bbio(bbio);
kfree(rbio);
return -EIO;
}
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
btrfs_bio_counter_inc_noblocked(root->fs_info);
rbio->generic_bio_cnt = 1;
} else {
- set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+ btrfs_get_bbio(bbio);
}
/*
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+ ASSERT(logical >= rbio->bbio->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
index = stripe_offset >> PAGE_CACHE_SHIFT;
rbio->bio_pages[index] = page;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 31d4a15..2b5d797 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
struct btrfs_device;
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num, int generic_io);
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io);
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len);
+ struct btrfs_bio *bbio, u64 stripe_len);
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
struct page *page, u64 logical);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b63ae20..0e7beea 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
struct btrfs_key top;
- u32 blocksize;
int err;
struct list_head extctl;
int refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
blocksize = root->nodesize;
re->logical = logical;
- re->blocksize = blocksize;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace);
- kfree(bbio);
+ btrfs_put_bbio(bbio);
return re;
error:
@@ -488,7 +486,7 @@ error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- kfree(bbio);
+ btrfs_put_bbio(bbio);
kfree(re);
return re_exist;
}
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
int mirror_num = 0;
struct extent_buffer *eb = NULL;
u64 logical;
- u32 blocksize;
int ret;
int i;
int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->reada_lock);
return 0;
}
- dev->reada_next = re->logical + re->blocksize;
+ dev->reada_next = re->logical + fs_info->tree_root->nodesize;
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
}
}
logical = re->logical;
- blocksize = re->blocksize;
spin_lock(&re->lock);
if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
return 0;
atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
- mirror_num, &eb);
+ ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+ mirror_num, &eb);
if (ret)
__readahead_hook(fs_info->extent_root, NULL, logical, ret);
else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
break;
printk(KERN_DEBUG
" re: logical %llu size %u empty %d for %lld",
- re->logical, re->blocksize,
+ re->logical, fs_info->tree_root->nodesize,
list_empty(&re->extctl), re->scheduled_for ?
re->scheduled_for->devid : -1);
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
printk(KERN_DEBUG
"re: logical %llu size %u list empty %d for %lld",
- re->logical, re->blocksize, list_empty(&re->extctl),
+ re->logical, fs_info->tree_root->nodesize,
+ list_empty(&re->extctl),
re->scheduled_for ? re->scheduled_for->devid : -1);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74257d6..d830853 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
}
}
-static int tree_block_processed(u64 bytenr, u32 blocksize,
- struct reloc_control *rc)
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
+ u32 blocksize = rc->extent_root->nodesize;
+
if (test_range_bit(&rc->processed_blocks, bytenr,
bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid);
+ readahead_tree_block(rc->extent_root, block->bytenr);
rb_node = rb_next(rb_node);
}
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
SKINNY_METADATA);
- if (tree_block_processed(bytenr, blocksize, rc))
+ if (tree_block_processed(bytenr, rc))
return 0;
if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
if (added)
goto next;
- if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+ if (!tree_block_processed(leaf->start, rc)) {
block = kmalloc(sizeof(*block), GFP_NOFS);
if (!block) {
err = -ENOMEM;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e427cb7e..ec57687 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,7 +66,6 @@ struct scrub_ctx;
struct scrub_recover {
atomic_t refs;
struct btrfs_bio *bbio;
- u64 *raid_map;
u64 map_length;
};
@@ -80,7 +79,7 @@ struct scrub_page {
u64 logical;
u64 physical;
u64 physical_for_dev_replace;
- atomic_t ref_count;
+ atomic_t refs;
struct {
unsigned int mirror_num:8;
unsigned int have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
- atomic_t ref_count; /* free mem on transition to zero */
+ atomic_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
int stripe_len;
- atomic_t ref_count;
+ atomic_t refs;
struct list_head spages;
@@ -194,6 +193,15 @@ struct scrub_ctx {
*/
struct btrfs_scrub_progress stat;
spinlock_t stat_lock;
+
+ /*
+ * Use a ref counter to avoid use-after-free issues. Scrub workers
+ * decrement bios_in_flight and workers_pending and then do a wakeup
+ * on the list_wait wait queue. We must ensure the main scrub task
+ * doesn't free the scrub context before or while the workers are
+ * doing the wakeup() call.
+ */
+ atomic_t refs;
};
struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
- struct btrfs_fs_info *fs_info,
- struct scrub_block *original_sblock,
- u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
const u8 *csum, u64 generation,
u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int force_write);
+ struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
+ atomic_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
atomic_dec(&sctx->bios_in_flight);
wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
}
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ atomic_inc(&sctx->refs);
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
atomic_dec(&sctx->workers_pending);
wake_up(&fs_info->scrub_pause_wait);
wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
}
static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
kfree(sctx);
}
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+ if (atomic_dec_and_test(&sctx->refs))
+ scrub_free_ctx(sctx);
+}
+
static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
if (!sctx)
goto nomem;
+ atomic_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = pages_per_rd_bio;
sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key root_key;
+ struct btrfs_key key;
root_key.objectid = root;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
goto err;
}
- ret = inode_item_info(inum, 0, local_root, swarn->path);
+ /*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+ key.objectid = inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
btrfs_release_path(swarn->path);
goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
static inline void scrub_put_recover(struct scrub_recover *recover)
{
if (atomic_dec_and_test(&recover->refs)) {
- kfree(recover->bbio);
- kfree(recover->raid_map);
+ btrfs_put_bbio(recover->bbio);
kfree(recover);
}
}
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
/* setup the context, map the logical blocks and alloc the pages */
- ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
- logical, sblocks_for_recheck);
+ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!is_metadata && !have_csum) {
struct scrub_fixup_nodatasum *fixup_nodatasum;
-nodatasum_case:
WARN_ON(sctx->is_dev_replace);
+nodatasum_case:
+
/*
* !is_metadata and !have_csum, this means that the data
* might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
sblock_other->no_io_error_seen) {
if (sctx->is_dev_replace) {
scrub_write_block_to_dev_replace(sblock_other);
+ goto corrected_error;
} else {
- int force_write = is_metadata || have_csum;
-
ret = scrub_repair_block_from_good_copy(
- sblock_bad, sblock_other,
- force_write);
+ sblock_bad, sblock_other);
+ if (!ret)
+ goto corrected_error;
}
- if (0 == ret)
- goto corrected_error;
}
}
- /*
- * for dev_replace, pick good pages and write to the target device.
- */
- if (sctx->is_dev_replace) {
- success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- int sub_success;
-
- sub_success = 0;
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- struct scrub_block *sblock_other =
- sblocks_for_recheck + mirror_index;
- struct scrub_page *page_other =
- sblock_other->pagev[page_num];
-
- if (!page_other->io_error) {
- ret = scrub_write_page_to_dev_replace(
- sblock_other, page_num);
- if (ret == 0) {
- /* succeeded for this page */
- sub_success = 1;
- break;
- } else {
- btrfs_dev_replace_stats_inc(
- &sctx->dev_root->
- fs_info->dev_replace.
- num_write_errors);
- }
- }
- }
-
- if (!sub_success) {
- /*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
- */
- success = 0;
- ret = scrub_write_page_to_dev_replace(
- sblock_bad, page_num);
- if (ret)
- btrfs_dev_replace_stats_inc(
- &sctx->dev_root->fs_info->
- dev_replace.num_write_errors);
- }
- }
-
- goto out;
- }
+ if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+ goto did_not_correct_error;
/*
- * for regular scrub, repair those pages that are errored.
* In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those pages.
* Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
* mirror, even if other 512 byte sectors in the same PAGE_SIZE
* area are unreadable.
*/
-
- /* can only fix I/O errors from here on */
- if (sblock_bad->no_io_error_seen)
- goto did_not_correct_error;
-
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (page_num = 0; page_num < sblock_bad->page_count;
+ page_num++) {
struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_block *sblock_other = NULL;
- if (!page_bad->io_error)
+ /* skip no-io-error page in scrub */
+ if (!page_bad->io_error && !sctx->is_dev_replace)
continue;
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- struct scrub_block *sblock_other = sblocks_for_recheck +
- mirror_index;
- struct scrub_page *page_other = sblock_other->pagev[
- page_num];
-
- if (!page_other->io_error) {
- ret = scrub_repair_page_from_good_copy(
- sblock_bad, sblock_other, page_num, 0);
- if (0 == ret) {
- page_bad->io_error = 0;
- break; /* succeeded for this page */
+ /* try to find no-io-error page in mirrors */
+ if (page_bad->io_error) {
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ if (!sblocks_for_recheck[mirror_index].
+ pagev[page_num]->io_error) {
+ sblock_other = sblocks_for_recheck +
+ mirror_index;
+ break;
}
}
+ if (!sblock_other)
+ success = 0;
}
- if (page_bad->io_error) {
- /* did not find a mirror to copy the page from */
- success = 0;
+ if (sctx->is_dev_replace) {
+ /*
+ * did not find a mirror to fetch the page
+ * from. scrub_write_page_to_dev_replace()
+ * handles this case (page->io_error), by
+ * filling the block with zeros before
+ * submitting the write request
+ */
+ if (!sblock_other)
+ sblock_other = sblock_bad;
+
+ if (scrub_write_page_to_dev_replace(sblock_other,
+ page_num) != 0) {
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->
+ fs_info->dev_replace.
+ num_write_errors);
+ success = 0;
+ }
+ } else if (sblock_other) {
+ ret = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_other,
+ page_num, 0);
+ if (0 == ret)
+ page_bad->io_error = 0;
+ else
+ success = 0;
}
}
- if (success) {
+ if (success && !sctx->is_dev_replace) {
if (is_metadata || have_csum) {
/*
* need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
- if (raid_map) {
- if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
- return 3;
- else
- return 2;
- } else {
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ return 3;
+ else
return (int)bbio->num_stripes;
- }
}
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+ u64 *raid_map,
u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
{
int i;
- if (raid_map) {
+ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/* RAID5/6 */
for (i = 0; i < nstripes; i++) {
if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
}
}
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
- struct btrfs_fs_info *fs_info,
- struct scrub_block *original_sblock,
- u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck)
{
+ struct scrub_ctx *sctx = original_sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 logical = original_sblock->pagev[0]->logical;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
- u64 *raid_map;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index;
+ int page_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
- * note: the two members ref_count and outstanding_pages
+ * note: the two members refs and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
- page_index = 0;
while (length > 0) {
sublen = min_t(u64, length, PAGE_SIZE);
mapped_length = sublen;
bbio = NULL;
- raid_map = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio, 0, &raid_map);
+ &mapped_length, &bbio, 0, 1);
if (ret || !bbio || mapped_length < sublen) {
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
return -ENOMEM;
}
atomic_set(&recover->refs, 1);
recover->bbio = bbio;
- recover->raid_map = raid_map;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
- nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+ nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
- if (mirror_index >= BTRFS_MAX_MIRRORS)
- continue;
-
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
sblock->pagev[page_index] = page;
page->logical = logical;
- scrub_stripe_index_and_offset(logical, raid_map,
+ scrub_stripe_index_and_offset(logical,
+ bbio->map_type,
+ bbio->raid_map,
mapped_length,
- bbio->num_stripes,
+ bbio->num_stripes -
+ bbio->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
- return page->recover && page->recover->raid_map;
+ return page->recover &&
+ (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
- page->recover->raid_map,
page->recover->map_length,
page->mirror_num, 0);
if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int force_write)
+ struct scrub_block *sblock_good)
{
int page_num;
int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
sblock_good,
- page_num,
- force_write);
+ page_num, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
static void scrub_block_get(struct scrub_block *sblock)
{
- atomic_inc(&sblock->ref_count);
+ atomic_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
- if (atomic_dec_and_test(&sblock->ref_count)) {
+ if (atomic_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
static void scrub_page_get(struct scrub_page *spage)
{
- atomic_inc(&spage->ref_count);
+ atomic_inc(&spage->refs);
}
static void scrub_page_put(struct scrub_page *spage)
{
- if (atomic_dec_and_test(&spage->ref_count)) {
+ if (atomic_dec_and_test(&spage->refs)) {
if (spage->page)
__free_page(spage->page);
kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->ref_count, 1);
+ atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->ref_count, 1);
+ atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_raid_bio *rbio;
struct scrub_page *spage;
struct btrfs_bio *bbio = NULL;
- u64 *raid_map = NULL;
u64 length;
int ret;
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
length = sparity->logic_end - sparity->logic_start + 1;
ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
sparity->logic_start,
- &length, &bbio, 0, &raid_map);
- if (ret || !bbio || !raid_map)
+ &length, &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
bio->bi_end_io = scrub_parity_bio_endio;
rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
- raid_map, length,
- sparity->scrub_dev,
+ length, sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
bbio_out:
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
static void scrub_parity_get(struct scrub_parity *sparity)
{
- atomic_inc(&sparity->ref_count);
+ atomic_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
- if (!atomic_dec_and_test(&sparity->ref_count))
+ if (!atomic_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
- atomic_set(&sparity->ref_count, 1);
+ atomic_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
mirror_num = 1;
@@ -3074,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
logical = base + offset;
physical_end = physical + nstripes * map->stripe_len;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical_end, num,
map, &logic_end, NULL);
logic_end += base;
@@ -3121,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
ret = 0;
while (physical < physical_end) {
/* for raid56, we skip parity stripe */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = get_raid56_logic_offset(physical, num,
map, &logical, &stripe_logical);
logical += base;
@@ -3280,8 +3254,7 @@ again:
scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/*
* loop until we find next data stripe
* or we have finished all stripes.
@@ -3775,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
mutex_unlock(&fs_info->scrub_lock);
- scrub_free_ctx(sctx);
+ scrub_put_ctx(sctx);
return ret;
}
@@ -3881,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
&mapped_length, &bbio, 0);
if (ret || !bbio || mapped_length < extent_len ||
!bbio->stripes[0].dev->bdev) {
- kfree(bbio);
+ btrfs_put_bbio(bbio);
return;
}
*extent_physical = bbio->stripes[0].physical;
*extent_mirror_num = bbio->mirror_num;
*extent_dev = bbio->stripes[0].dev;
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 804432d..d6033f5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -230,6 +230,7 @@ struct pending_dir_move {
u64 parent_ino;
u64 ino;
u64 gen;
+ bool is_orphan;
struct list_head update_refs;
};
@@ -2471,12 +2472,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
- btrfs_inode_atime(ii));
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
- btrfs_inode_mtime(ii));
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
- btrfs_inode_ctime(ii));
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
/* TODO Add otime support when the otime patches get into upstream */
ret = send_cmd(sctx);
@@ -2987,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx,
u64 ino_gen,
u64 parent_ino,
struct list_head *new_refs,
- struct list_head *deleted_refs)
+ struct list_head *deleted_refs,
+ const bool is_orphan)
{
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -3002,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
pm->parent_ino = parent_ino;
pm->ino = ino;
pm->gen = ino_gen;
+ pm->is_orphan = is_orphan;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -3134,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
rmdir_ino = dm->rmdir_ino;
free_waiting_dir_move(sctx, dm);
- ret = get_first_ref(sctx->parent_root, pm->ino,
- &parent_ino, &parent_gen, name);
- if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, parent_ino, parent_gen,
- from_path);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(from_path, name);
+ if (pm->is_orphan) {
+ ret = gen_unique_name(sctx, pm->ino,
+ pm->gen, from_path);
+ } else {
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ }
if (ret < 0)
goto out;
@@ -3153,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
LIST_HEAD(deleted_refs);
ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
- &pm->update_refs, &deleted_refs);
+ &pm->update_refs, &deleted_refs,
+ pm->is_orphan);
if (ret < 0)
goto out;
if (rmdir_ino) {
@@ -3286,6 +3291,127 @@ out:
return ret;
}
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 257)
+ * | |---- file (ino 260)
+ * |
+ * |---- b/ (ino 258)
+ * |---- c/ (ino 259)
+ *
+ * Send snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 258)
+ * |---- x/ (ino 259)
+ * |---- y/ (ino 257)
+ * |----- file (ino 260)
+ *
+ * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key di_key;
+ struct btrfs_dir_item *di;
+ u64 left_gen;
+ u64 right_gen;
+ int ret = 0;
+
+ if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = parent_ref->dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_match_dir_item_name(sctx->parent_root, path,
+ parent_ref->name, parent_ref->name_len);
+ if (!di) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * di_key.objectid has the number of the inode that has a dentry in the
+ * parent directory with the same name that sctx->cur_ino is being
+ * renamed to. We need to check if that inode is in the send root as
+ * well and if it is currently marked as an inode with a pending rename,
+ * if it is, we need to delay the rename of sctx->cur_ino as well, so
+ * that it happens after that other inode is renamed.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+ if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
+ &left_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
+ &right_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+
+ /* Different inode, no need to delay the rename of sctx->cur_ino */
+ if (right_gen != left_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ if (is_waiting_for_move(sctx, di_key.objectid)) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ di_key.objectid,
+ &sctx->new_refs,
+ &sctx->deleted_refs,
+ is_orphan);
+ if (!ret)
+ ret = 1;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref)
{
@@ -3352,7 +3478,8 @@ out:
sctx->cur_inode_gen,
ino,
&sctx->new_refs,
- &sctx->deleted_refs);
+ &sctx->deleted_refs,
+ false);
if (!ret)
ret = 1;
}
@@ -3375,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
int did_overwrite = 0;
int is_orphan = 0;
u64 last_dir_ino_rm = 0;
+ bool can_rename = true;
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
@@ -3493,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+ ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
* it depending on the inode mode.
*/
- if (is_orphan) {
+ if (is_orphan && can_rename) {
ret = send_rename(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
@@ -3506,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = fs_path_copy(valid_path, cur->full_path);
if (ret < 0)
goto out;
- } else {
+ } else if (can_rename) {
if (S_ISDIR(sctx->cur_inode_mode)) {
/*
* Dirs can't be linked, so move it. For moved
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6f49b28..05fef19 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1958,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans, root);
}
-static int btrfs_unfreeze(struct super_block *sb)
-{
- return 0;
-}
-
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2011,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
- .unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92db3f6..94edb0a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
ret = btrfs_init_debugfs();
if (ret)
- return ret;
+ goto out1;
init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+ if (ret)
+ goto out2;
+
+ return 0;
+out2:
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+ kset_unregister(btrfs_kset);
return ret;
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index cc286ce..f51963a 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
return -ENOMEM;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
if (!eb) {
test_msg("Could not allocate dummy buffer\n");
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7e99c2f..9e9f236 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1,
- (unsigned long)-1, GFP_NOFS);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
out:
if (locked_page)
page_cache_release(locked_page);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ae0f5b..054fc0d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(0, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(0, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -911,6 +911,197 @@ out:
return ret;
}
+static int test_extent_accounting(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ BTRFS_I(inode)->root = root;
+ btrfs_test_inode_set_ops(inode);
+
+ /* [BTRFS_MAX_EXTENT_SIZE] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 1) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 1, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][4k] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ EXTENT_DELALLOC | EXTENT_DIRTY |
+ EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][4K] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+ *
+ * I'm artificially adding 2 to outstanding_extents because in the
+ * buffered IO case we'd add things up as we go, but I don't feel like
+ * doing that here, this isn't the interesting case we want to test.
+ */
+ BTRFS_I(inode)->outstanding_extents += 2;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
+ (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 4, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 3, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 4, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * Refill the hole again just for good measure, because I thought it
+ * might fail and I'd rather satisfy my paranoia at this point.
+ */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 3, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* Empty */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 0, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
int btrfs_test_inodes(void)
{
int ret;
@@ -924,5 +1115,9 @@ int btrfs_test_inodes(void)
if (ret)
return ret;
test_msg("Running hole first btrfs_get_extent test\n");
- return test_hole_first();
+ ret = test_hole_first();
+ if (ret)
+ return ret;
+ test_msg("Running outstanding_extents tests\n");
+ return test_extent_accounting();
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ec3dcb2..73f299e 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
ret = -ENOMEM;
goto out;
}
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
/*
* Can't use bytenr 0, some things freak out
* *cough*backref walking code*cough*
*/
- root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
goto out;
}
- /* We are using this root as our extent root */
- root->fs_info->extent_root = root;
-
- /*
- * Some of the paths we test assume we have a filled out fs_info, so we
- * just need to addt he root in there so we don't panic.
- */
- root->fs_info->tree_root = root;
- root->fs_info->quota_root = root;
- root->fs_info->quota_enabled = 1;
-
test_msg("Running qgroup tests\n");
ret = test_no_shared_qgroup(root);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e88b59d..8be4278 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@ loop:
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
+ cur_trans->have_free_bgs = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->pending_ordered);
+ INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ spin_lock_init(&cur_trans->dirty_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -1022,7 +1025,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = root->fs_info->tree_root;
old_root_used = btrfs_root_used(&root->root_item);
- btrfs_write_dirty_block_groups(trans, root);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -1038,9 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
old_root_used = btrfs_root_used(&root->root_item);
- ret = btrfs_write_dirty_block_groups(trans, root);
- if (ret)
- return ret;
}
return 0;
@@ -1057,14 +1056,11 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *next;
struct extent_buffer *eb;
int ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret)
- return ret;
-
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb);
@@ -1088,15 +1084,20 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_setup_space_cache(trans, root);
+ if (ret)
+ return ret;
+
/* run_qgroups might have added some more refs */
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
-
+again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ clear_bit(BTRFS_ROOT_DIRTY, &root->state);
if (root != fs_info->extent_root)
list_add_tail(&root->dirty_list,
@@ -1104,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+ }
+
+ while (!list_empty(dirty_bgs)) {
+ ret = btrfs_write_dirty_block_groups(trans, root);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
}
+ if (!list_empty(&fs_info->dirty_cowonly_roots))
+ goto again;
+
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
btrfs_after_dev_replace_commit(fs_info);
@@ -1803,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_for_commit(root, cur_trans);
+ if (unlikely(cur_trans->aborted))
+ ret = cur_trans->aborted;
+
btrfs_put_transaction(cur_trans);
return ret;
@@ -1983,6 +2002,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
update_super_roots(root);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2046,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
+ if (cur_trans->have_free_bgs)
+ btrfs_clear_space_info_full(root->fs_info);
+
root->fs_info->last_trans_committed = cur_trans->transid;
/*
* We needn't acquire the lock here because there is no other task
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 00ed29c..937050a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
atomic_t num_writers;
atomic_t use_count;
+ /*
+ * true if there is free bgs operations in this transaction
+ */
+ int have_free_bgs;
+
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct list_head pending_ordered;
struct list_head switch_commits;
+ struct list_head dirty_bgs;
+ spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1a9585d..c5b8ba3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
insert:
btrfs_release_path(path);
/* try to insert the key into the destination tree */
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
+ path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
- if (ret == -EEXIST) {
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
@@ -488,8 +490,20 @@ insert:
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0)
+ if (btrfs_inode_generation(eb, src_item) == 0) {
+ struct extent_buffer *dst_eb = path->nodes[0];
+
+ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+ struct btrfs_map_token token;
+ u64 ino_size = btrfs_inode_size(eb, src_item);
+
+ btrfs_init_map_token(&token);
+ btrfs_set_token_inode_size(dst_eb, dst_item,
+ ino_size, &token);
+ }
goto no_copy;
+ }
if (overwrite_root &&
S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- char *name, int namelen)
+ const char *name, int namelen)
{
struct btrfs_path *path;
struct btrfs_inode_ref *ref;
@@ -998,7 +1012,7 @@ again:
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)base + cur_offset;
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
@@ -1254,13 +1268,14 @@ out:
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset)
+ struct btrfs_root *root, u64 ino)
{
int ret;
- ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
- offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
- if (ret > 0)
- ret = btrfs_insert_orphan_item(trans, root, offset);
+
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
+
return ret;
}
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
}
btrfs_release_path(path);
- if (ret < 0)
+ if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
nlink = ret;
ret = count_inode_extrefs(root, inode, path);
- if (ret == -ENOENT)
- ret = 0;
-
if (ret < 0)
goto out;
@@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+ const char *name, const int name_len,
+ const u64 dirid, const u64 ino)
+{
+ struct btrfs_key search_key;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = dirid;
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ return false;
+}
+
+/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -1666,10 +1703,17 @@ out:
return ret;
insert:
+ if (name_in_log_ref(root->log_root, name, name_len,
+ key->objectid, log_key.objectid)) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
- if (ret && ret != -ENOENT)
+ if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
update_size = false;
ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
if (atomic_read(&root->log_writers))
schedule();
- mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
+ mutex_lock(&root->log_mutex);
}
}
@@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only)
+ struct inode *inode, int log_inode_only,
+ u64 logged_isize)
{
struct btrfs_map_token token;
@@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* to say 'update this inode with these values'
*/
btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
} else {
btrfs_set_token_inode_generation(leaf, item,
BTRFS_I(inode)->generation,
@@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
btrfs_release_path(path);
return 0;
}
@@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path, u64 *last_extent,
- int start_slot, int nr, int inode_only)
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
{
unsigned long src_offset;
unsigned long dst_offset;
@@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
dst_path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
- inode, inode_only == LOG_INODE_EXISTS);
+ inode, inode_only == LOG_INODE_EXISTS,
+ logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
src_offset, ins_sizes[i]);
@@ -3902,6 +3949,33 @@ process:
return ret;
}
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = i_size_read(inode);
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ }
+
+ btrfs_release_path(path);
+ return 0;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 logged_isize = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /* Only run delayed items if we are a dir or a new file */
+ /*
+ * Only run delayed items if we are a dir or a new file.
+ * Otherwise commit the delayed inode only, which is needed in
+ * order for the log replay code to mark inodes for link count
+ * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ */
if (S_ISDIR(inode->i_mode) ||
- BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
ret = btrfs_commit_inode_delayed_items(trans, inode);
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
- }
+ else
+ ret = btrfs_commit_inode_delayed_inode(inode);
+
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
}
mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key_type = BTRFS_INODE_EXTREF_KEY;
+ max_key.type = max_key_type;
+ }
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
- if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
- } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags) ||
+ if (inode_only == LOG_INODE_EXISTS) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ err = logged_inode_size(log, inode, path,
+ &logged_isize);
+ if (err)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_INODE_EXTREF_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ ret = btrfs_truncate_inode_items(trans, log,
+ inode, 0, 0);
+ }
+ } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
- if (inode_only == LOG_INODE_ALL)
+ if (inode_only == LOG_INODE_ALL) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
fast_search = true;
- max_key.type = BTRFS_XATTR_ITEM_KEY;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ } else {
+ max_key.type = BTRFS_INODE_EXTREF_KEY;
+ }
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@@ -4047,7 +4163,8 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only);
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4071,7 +4188,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
- ins_nr, inode_only);
+ ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4092,7 +4209,8 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only);
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
+ const struct dentry * const first_parent = parent;
+ const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+ last_committed);
sb = inode->i_sb;
@@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- inode_only = LOG_INODE_EXISTS;
while (1) {
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
@@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
+ /*
+ * On unlink we must make sure our immediate parent directory
+ * inode is fully logged. This is to prevent leaving dangling
+ * directory index entries and a wrong directory inode's i_size.
+ * Not doing so can result in a directory being impossible to
+ * delete after log replay (rmdir will always fail with error
+ * -ENOTEMPTY).
+ */
+ if (did_unlink && parent == first_parent)
+ inode_only = LOG_INODE_ALL;
+ else
+ inode_only = LOG_INODE_EXISTS;
+
if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed) {
+ root->fs_info->last_trans_committed ||
+ inode_only == LOG_INODE_ALL) {
ret = btrfs_log_inode(trans, root, inode, inode_only,
0, LLONG_MAX, ctx);
if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50c5a87..8222f6f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1310,6 +1310,8 @@ again:
if (ret) {
btrfs_error(root->fs_info, ret,
"Failed to remove dev extent item");
+ } else {
+ trans->transaction->have_free_bgs = 1;
}
out:
btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
- if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
return;
btrfs_set_fs_incompat(info, RAID56);
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
- }
free_extent_map(em);
return len;
}
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6))
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
struct btrfs_bio_stripe s;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
int i;
u64 l;
int again = 1;
- int m;
while (again) {
again = 0;
- for (i = 0; i < real_stripes - 1; i++) {
- if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ for (i = 0; i < num_stripes - 1; i++) {
+ if (parity_smaller(bbio->raid_map[i],
+ bbio->raid_map[i+1])) {
s = bbio->stripes[i];
- l = raid_map[i];
+ l = bbio->raid_map[i];
bbio->stripes[i] = bbio->stripes[i+1];
- raid_map[i] = raid_map[i+1];
+ bbio->raid_map[i] = bbio->raid_map[i+1];
bbio->stripes[i+1] = s;
- raid_map[i+1] = l;
-
- if (bbio->tgtdev_map) {
- m = bbio->tgtdev_map[i];
- bbio->tgtdev_map[i] =
- bbio->tgtdev_map[i + 1];
- bbio->tgtdev_map[i + 1] = m;
- }
+ bbio->raid_map[i+1] = l;
again = 1;
}
@@ -4909,10 +4900,48 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
}
}
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+ struct btrfs_bio *bbio = kzalloc(
+ /* the size of the btrfs_bio */
+ sizeof(struct btrfs_bio) +
+ /* plus the variable array for the stripes */
+ sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+ /* plus the variable array for the tgt dev */
+ sizeof(int) * (real_stripes) +
+ /*
+ * plus the raid_map, which includes both the tgt dev
+ * and the stripes
+ */
+ sizeof(u64) * (total_stripes),
+ GFP_NOFS);
+ if (!bbio)
+ return NULL;
+
+ atomic_set(&bbio->error, 0);
+ atomic_set(&bbio->refs, 1);
+
+ return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+ WARN_ON(!atomic_read(&bbio->refs));
+ atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+ if (!bbio)
+ return;
+ if (atomic_dec_and_test(&bbio->refs))
+ kfree(bbio);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
- int mirror_num, u64 **raid_map_ret)
+ int mirror_num, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -4925,7 +4954,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr_orig;
u64 stripe_nr_end;
u64 stripe_len;
- u64 *raid_map = NULL;
int stripe_index;
int i;
int ret = 0;
@@ -4976,7 +5004,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_offset = offset - stripe_offset;
/* if we're here for raid56, we need to know the stripe aligned start */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
raid56_full_stripe_start = offset;
@@ -4989,8 +5017,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & REQ_DISCARD) {
/* we don't discard raid56 yet */
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -5000,7 +5027,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
(rw & REQ_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
@@ -5047,7 +5074,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 physical_of_found = 0;
ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, NULL);
+ logical, &tmp_length, &tmp_bbio, 0, 0);
if (ret) {
WARN_ON(tmp_bbio != NULL);
goto out;
@@ -5061,7 +5088,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* is not left of the left cursor
*/
ret = -EIO;
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
goto out;
}
@@ -5096,11 +5123,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
} else {
WARN_ON(1);
ret = -EIO;
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
goto out;
}
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
@@ -5166,15 +5193,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
mirror_num = stripe_index - old_stripe_index + 1;
}
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
- u64 tmp;
-
- if (raid_map_ret &&
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ if (need_raid_map &&
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
- int i, rot;
-
/* push stripe_nr back to the start of the full stripe */
stripe_nr = raid56_full_stripe_start;
do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5205,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
- raid_map = kmalloc_array(num_stripes, sizeof(u64),
- GFP_NOFS);
- if (!raid_map) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Work out the disk rotation on this stripe-set */
- tmp = stripe_nr;
- rot = do_div(tmp, num_stripes);
-
- /* Fill in the logical address of each stripe */
- tmp = stripe_nr * nr_data_stripes(map);
- for (i = 0; i < nr_data_stripes(map); i++)
- raid_map[(i+rot) % num_stripes] =
- em->start + (tmp + i) * map->stripe_len;
-
- raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
- if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- raid_map[(i+rot+1) % num_stripes] =
- RAID6_Q_STRIPE;
-
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
} else {
+ u64 tmp;
+
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
@@ -5246,17 +5248,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
tgtdev_indexes = num_stripes;
}
- bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
- GFP_NOFS);
+ bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
if (!bbio) {
- kfree(raid_map);
ret = -ENOMEM;
goto out;
}
- atomic_set(&bbio->error, 0);
if (dev_replace_is_ongoing)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+ /* build raid_map */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+ need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ mirror_num > 1)) {
+ u64 tmp;
+ int i, rot;
+
+ bbio->raid_map = (u64 *)((void *)bbio->stripes +
+ sizeof(struct btrfs_bio_stripe) *
+ num_alloc_stripes +
+ sizeof(int) * tgtdev_indexes);
+
+ /* Work out the disk rotation on this stripe-set */
+ tmp = stripe_nr;
+ rot = do_div(tmp, num_stripes);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ bbio->raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ bbio->raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
+ }
+
if (rw & REQ_DISCARD) {
int factor = 0;
int sub_stripes = 0;
@@ -5340,6 +5367,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
max_errors = btrfs_chunk_max_errors(map);
+ if (bbio->raid_map)
+ sort_parity_stripes(bbio, num_stripes);
+
tgtdev_indexes = 0;
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
@@ -5427,6 +5457,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
*bbio_ret = bbio;
+ bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
@@ -5443,10 +5474,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
- if (raid_map) {
- sort_parity_stripes(bbio, raid_map);
- *raid_map_ret = raid_map;
- }
out:
if (dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5486,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num, NULL);
+ mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
- u64 **raid_map_ret)
+ int need_raid_map)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num, raid_map_ret);
+ mirror_num, need_raid_map);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5538,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
do_div(length, map->num_stripes);
- else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
do_div(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
@@ -5565,7 +5591,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
bio_endio_nodec(bio, err);
else
bio_endio(bio, err);
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5834,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
- u64 *raid_map = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
@@ -5819,7 +5844,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- mirror_num, &raid_map);
+ mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(root->fs_info);
return ret;
@@ -5832,15 +5857,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
- if (raid_map) {
+ if (bbio->raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- ret = raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio, map_length);
} else {
- ret = raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
+ ret = raid56_parity_recover(root, bio, bbio, map_length,
mirror_num, 1);
}
@@ -6238,17 +6261,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
- u8 *ptr;
- unsigned long sb_ptr;
+ u8 *array_ptr;
+ unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
- u32 cur;
+ u32 cur_offset;
struct btrfs_key key;
- sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
- BTRFS_SUPER_INFO_SIZE);
+ ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+ /*
+ * This will create extent buffer of nodesize, superblock size is
+ * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+ * overallocate but we can keep it as-is, only the first page is used.
+ */
+ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6299,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
- ptr = super_copy->sys_chunk_array;
- sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
- cur = 0;
+ array_ptr = super_copy->sys_chunk_array;
+ sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur_offset = 0;
+
+ while (cur_offset < array_size) {
+ disk_key = (struct btrfs_disk_key *)array_ptr;
+ len = sizeof(*disk_key);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
- while (cur < array_size) {
- disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
- len = sizeof(*disk_key); ptr += len;
- sb_ptr += len;
- cur += len;
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
- chunk = (struct btrfs_chunk *)sb_ptr;
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be
+ * present, exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
ret = read_one_chunk(root, &key, sb, chunk);
if (ret)
break;
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- len = btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
- ptr += len;
- sb_ptr += len;
- cur += len;
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
}
free_extent_buffer(sb);
return ret;
+
+out_short_read:
+ printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+ len, cur_offset);
+ free_extent_buffer(sb);
+ return -EIO;
}
int btrfs_read_chunk_tree(struct btrfs_root *root)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d6fe73c..83069de 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
struct btrfs_bio {
+ atomic_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
+ u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
int mirror_num;
int num_tgtdevs;
int *tgtdev_map;
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
struct btrfs_bio_stripe stripes[];
};
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
-
-#define btrfs_bio_size(total_stripes, real_stripes) \
- (sizeof(struct btrfs_bio) + \
- (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
- (sizeof(int) * (real_stripes)))
-
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
- u64 **raid_map_ret);
+ int need_raid_map);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 47b1946..883b936 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -111,6 +111,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
name, name_len, -1);
if (!di && (flags & XATTR_REPLACE))
ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
else if (di)
ret = btrfs_delete_one_dir_name(trans, root, path, di);
goto out;
@@ -127,10 +129,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
ASSERT(mutex_is_locked(&inode->i_mutex));
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
name, name_len, 0);
- if (!di) {
+ if (!di)
ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
+ if (ret)
goto out;
- }
btrfs_release_path(path);
di = NULL;
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index ce1b115..f601def 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -574,7 +574,7 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
/* extract the directory dentry from the cwd */
get_fs_pwd(current->fs, &path);
- if (!S_ISDIR(path.dentry->d_inode->i_mode))
+ if (!d_can_lookup(path.dentry))
goto notdir;
cachefiles_begin_secure(cache, &saved_cred);
@@ -646,7 +646,7 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
/* extract the directory dentry from the cwd */
get_fs_pwd(current->fs, &path);
- if (!S_ISDIR(path.dentry->d_inode->i_mode))
+ if (!d_can_lookup(path.dentry))
goto notdir;
cachefiles_begin_secure(cache, &saved_cred);
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1c7293c..2324262 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -437,7 +437,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
if (!object->backer)
return -ENOBUFS;
- ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+ ASSERT(d_is_reg(object->backer));
fscache_set_store_limit(&object->fscache, ni_size);
@@ -501,7 +501,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op)
op->object->debug_id, (unsigned long long)ni_size);
if (object->backer) {
- ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+ ASSERT(d_is_reg(object->backer));
fscache_set_store_limit(&object->fscache, ni_size);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7f8e83f..1e51714e 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -277,7 +277,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
_debug("remove %p from %p", rep, dir);
/* non-directories can just be unlinked */
- if (!S_ISDIR(rep->d_inode->i_mode)) {
+ if (!d_is_dir(rep)) {
_debug("unlink stale object");
path.mnt = cache->mnt;
@@ -323,7 +323,7 @@ try_again:
return 0;
}
- if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
+ if (!d_can_lookup(cache->graveyard)) {
unlock_rename(cache->graveyard, dir);
cachefiles_io_error(cache, "Graveyard no longer a directory");
return -EIO;
@@ -475,7 +475,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
ASSERT(parent->dentry);
ASSERT(parent->dentry->d_inode);
- if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
+ if (!(d_is_dir(parent->dentry))) {
// TODO: convert file to dir
_leave("looking up in none directory");
return -ENOBUFS;
@@ -539,7 +539,7 @@ lookup_again:
_debug("mkdir -> %p{%p{ino=%lu}}",
next, next->d_inode, next->d_inode->i_ino);
- } else if (!S_ISDIR(next->d_inode->i_mode)) {
+ } else if (!d_can_lookup(next)) {
pr_err("inode %lu is not a directory\n",
next->d_inode->i_ino);
ret = -ENOBUFS;
@@ -568,8 +568,8 @@ lookup_again:
_debug("create -> %p{%p{ino=%lu}}",
next, next->d_inode, next->d_inode->i_ino);
- } else if (!S_ISDIR(next->d_inode->i_mode) &&
- !S_ISREG(next->d_inode->i_mode)
+ } else if (!d_can_lookup(next) &&
+ !d_is_reg(next)
) {
pr_err("inode %lu is not a file or directory\n",
next->d_inode->i_ino);
@@ -642,7 +642,7 @@ lookup_again:
/* open a file interface onto a data file */
if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
- if (S_ISREG(object->dentry->d_inode->i_mode)) {
+ if (d_is_reg(object->dentry)) {
const struct address_space_operations *aops;
ret = -EPERM;
@@ -763,7 +763,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
/* we need to make sure the subdir is a directory */
ASSERT(subdir->d_inode);
- if (!S_ISDIR(subdir->d_inode->i_mode)) {
+ if (!d_can_lookup(subdir)) {
pr_err("%s is not a directory\n", dirname);
ret = -EIO;
goto check_error;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 616db0e7..c6cd8d7 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -900,7 +900,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
return -ENOBUFS;
}
- ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+ ASSERT(d_is_reg(object->backer));
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 5bd853b..64fa248 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
}
-static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
- int type)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct posix_acl *acl = ACL_NOT_CACHED;
-
- spin_lock(&ci->i_ceph_lock);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
- acl = get_cached_acl(inode, type);
- spin_unlock(&ci->i_ceph_lock);
-
- return acl;
-}
-
struct posix_acl *ceph_get_acl(struct inode *inode, int type)
{
int size;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 24be059..fd5599d 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page)
u64 len = PAGE_CACHE_SIZE;
if (off >= i_size_read(inode)) {
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
return 0;
}
- /*
- * Uptodate inline data should have been added into page cache
- * while getting Fcr caps.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE)
- return -EINVAL;
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ /*
+ * Uptodate inline data should have been added
+ * into page cache while getting Fcr caps.
+ */
+ if (off == 0)
+ return -EINVAL;
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ return 0;
+ }
err = ceph_readpage_from_fscache(inode, page);
if (err == 0)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b93c631..8172775 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode,
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
if (realm) {
- ceph_get_snap_realm(mdsc, realm);
spin_lock(&realm->inodes_with_caps_lock);
ci->i_snap_realm = realm;
list_add(&ci->i_snap_realm_item,
@@ -1451,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode,
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
if (list_empty(&ci->i_flushing_item)) {
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
dout(" inode %p now flushing seq %lld\n", inode,
@@ -2073,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page,
- int *check_max, int *err)
+ loff_t endoff, int *got, int *check_max, int *err)
{
struct inode *inode = &ci->vfs_inode;
int ret = 0;
- int have, implemented, _got = 0;
+ int have, implemented;
int file_wanted;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
-again:
+
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2138,50 +2136,34 @@ again:
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
- _got = need | (have & want);
- __take_cap_refs(ci, _got);
+ *got = need | (have & want);
+ __take_cap_refs(ci, *got);
ret = 1;
}
} else {
+ int session_readonly = false;
+ if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ struct ceph_mds_session *s = ci->i_auth_cap->session;
+ spin_lock(&s->s_cap_lock);
+ session_readonly = s->s_readonly;
+ spin_unlock(&s->s_cap_lock);
+ }
+ if (session_readonly) {
+ dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ *err = -EROFS;
+ ret = 1;
+ goto out_unlock;
+ }
+
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
spin_unlock(&ci->i_ceph_lock);
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- i_size_read(inode) > 0) {
- int ret1;
- struct page *page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- goto out;
- }
- page_cache_release(page);
- }
- /*
- * drop cap refs first because getattr while holding
- * caps refs can cause deadlock.
- */
- ceph_put_cap_refs(ci, _got);
- _got = 0;
-
- /* getattr request will bring inline data into page cache */
- ret1 = __ceph_do_getattr(inode, NULL,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (ret1 >= 0) {
- ret = 0;
- goto again;
- }
- *err = ret1;
- ret = 1;
- }
-out:
dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(_got));
- *got = _got;
+ ret, ceph_cap_string(*got));
return ret;
}
@@ -2221,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
- int check_max, ret, err;
+ int _got, check_max, ret, err = 0;
retry:
if (endoff > 0)
check_max_size(&ci->vfs_inode, endoff);
+ _got = 0;
check_max = 0;
- err = 0;
ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- got, pinned_page,
- &check_max, &err));
+ try_get_cap_refs(ci, need, want, endoff,
+ &_got, &check_max, &err));
if (err)
ret = err;
+ if (ret < 0)
+ return ret;
+
if (check_max)
goto retry;
- return ret;
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ goto out;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while holding
+ * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /* getattr request will bring inline data into page cache */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (ret < 0)
+ return ret;
+ goto retry;
+ }
+out:
+ *got = _got;
+ return 0;
}
/*
@@ -2432,13 +2444,13 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- void *snaptrace, int snaptrace_len,
u64 inline_version,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
+ __releases(mdsc->snap_rwsem)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -2639,10 +2651,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
spin_unlock(&ci->i_ceph_lock);
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace,
- snaptrace + snaptrace_len, false);
- downgrade_write(&mdsc->snap_rwsem);
kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
@@ -3052,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
+ struct ceph_snap_realm *realm;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
@@ -3153,11 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session,
goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
+ realm = NULL;
+ if (snaptrace_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len,
+ false, &realm);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
}
@@ -3177,7 +3198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h, NULL, 0,
+ handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
goto done_unlocked;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c241603..83e9976 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -26,8 +26,6 @@
* point by name.
*/
-const struct inode_operations ceph_dir_iops;
-const struct file_operations ceph_dir_fops;
const struct dentry_operations ceph_dentry_ops;
/*
@@ -672,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
/*
* We created the item, then did a lookup, and found
* it was already linked to another inode we already
- * had in our cache (and thus got spliced). Link our
- * dentry to that inode, but don't hash it, just in
- * case the VFS wants to dereference it.
+ * had in our cache (and thus got spliced). To not
+ * confuse VFS (especially when inode is a directory),
+ * we don't link our dentry to that inode, return an
+ * error instead.
+ *
+ * This event should be rare and it happens only when
+ * we talk to old MDS. Recent MDS does not send traceless
+ * reply for request that creates new inode.
*/
- BUG_ON(!result->d_inode);
- d_instantiate(dentry, result->d_inode);
- return 0;
+ d_drop(result);
+ return -ESTALE;
}
return PTR_ERR(result);
}
@@ -902,7 +904,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
} else if (ceph_snap(dir) == CEPH_NOSNAP) {
dout("unlink/rmdir dir %p dn %p inode %p\n",
dir, dentry, inode);
- op = S_ISDIR(dentry->d_inode->i_mode) ?
+ op = d_is_dir(dentry) ?
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
@@ -1335,6 +1337,13 @@ const struct file_operations ceph_dir_fops = {
.fsync = ceph_dir_fsync,
};
+const struct file_operations ceph_snapdir_fops = {
+ .iterate = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+};
+
const struct inode_operations ceph_dir_iops = {
.lookup = ceph_lookup,
.permission = ceph_permission,
@@ -1357,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = {
.atomic_open = ceph_atomic_open,
};
+const struct inode_operations ceph_snapdir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .mkdir = ceph_mkdir,
+ .rmdir = ceph_unlink,
+};
+
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 98e257c..139f2fe 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -274,10 +274,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
+ err = ceph_handle_snapdir(req, dentry, err);
if (err)
goto out_req;
- err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -291,7 +291,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
if (err)
goto out_req;
- if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
+ if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
dout("atomic_open finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
@@ -391,13 +391,14 @@ more:
if (ret >= 0) {
int didpages;
if (was_short && (pos + ret < inode->i_size)) {
- u64 tmp = min(this_len - ret,
- inode->i_size - pos - ret);
+ int zlen = min(this_len - ret,
+ inode->i_size - pos - ret);
+ int zoff = (o_direct ? buf_align : io_align) +
+ read + ret;
dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + tmp);
- ceph_zero_page_vector_range(page_align + read + ret,
- tmp, pages);
- ret += tmp;
+ pos + ret, pos + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
}
didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@ -877,28 +878,34 @@ again:
i_size = i_size_read(inode);
if (retry_op == READ_INLINE) {
- /* does not support inline data > PAGE_SIZE */
- if (i_size > PAGE_CACHE_SIZE) {
- ret = -EIO;
- } else if (iocb->ki_pos < i_size) {
+ BUG_ON(ret > 0 || read > 0);
+ if (iocb->ki_pos < i_size &&
+ iocb->ki_pos < PAGE_CACHE_SIZE) {
loff_t end = min_t(loff_t, i_size,
iocb->ki_pos + len);
+ end = min_t(loff_t, end, PAGE_CACHE_SIZE);
if (statret < end)
zero_user_segment(page, statret, end);
ret = copy_page_to_iter(page,
iocb->ki_pos & ~PAGE_MASK,
end - iocb->ki_pos, to);
iocb->ki_pos += ret;
- } else {
- ret = 0;
+ read += ret;
+ }
+ if (iocb->ki_pos < i_size && read < len) {
+ size_t zlen = min_t(size_t, len - read,
+ i_size - iocb->ki_pos);
+ ret = iov_iter_zero(zlen, to);
+ iocb->ki_pos += ret;
+ read += ret;
}
__free_pages(page, 0);
- return ret;
+ return read;
}
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
- ret < len) {
+ ret < len) {
dout("sync_read hit hole, ppos %lld < size %lld"
", reading more\n", iocb->ki_pos,
inode->i_size);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6b51736..119c43c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
return inode;
@@ -838,30 +838,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ceph_vinop(inode), inode->i_mode);
}
- /* set dir completion flag? */
- if (S_ISDIR(inode->i_mode) &&
- ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- !__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
- __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count),
- ci->i_ordered_count);
- }
-
/* were we issued a capability? */
if (info->cap.caps) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
+ unsigned caps = le32_to_cpu(info->cap.caps);
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode,
- le32_to_cpu(info->cap.caps),
+ cap_fmode, caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
info->cap.flags, &new_cap);
+
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ (caps & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ dout(" marking %p complete (empty)\n", inode);
+ __ceph_dir_set_complete(ci,
+ atomic_read(&ci->i_release_count),
+ ci->i_ordered_count);
+ }
+
wake = true;
} else {
dout(" %p got snap_caps %s\n", inode,
@@ -1446,12 +1447,14 @@ retry_lookup:
}
if (!dn->d_inode) {
- dn = splice_dentry(dn, in, NULL);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ struct dentry *realdn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(realdn)) {
+ err = PTR_ERR(realdn);
+ d_drop(dn);
dn = NULL;
goto next_item;
}
+ dn = realdn;
}
di = dn->d_fsdata;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 06ea5cd..4347039 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -245,6 +245,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
+ struct file_lock *lock;
struct file_lock_context *ctx;
*fcntl_count = 0;
@@ -252,8 +253,12 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
ctx = inode->i_flctx;
if (ctx) {
- *fcntl_count = ctx->flc_posix_cnt;
- *flock_count = ctx->flc_flock_cnt;
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(lock, &ctx->flc_posix, fl_list)
+ ++(*fcntl_count);
+ list_for_each_entry(lock, &ctx->flc_flock, fl_list)
+ ++(*flock_count);
+ spin_unlock(&ctx->flc_lock);
}
dout("counted %d flock locks and %d fcntl locks",
*flock_count, *fcntl_count);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5f62fb7..71c073f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->max_sessions = newmax;
}
mdsc->sessions[mds] = s;
+ atomic_inc(&mdsc->num_sessions);
atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
@@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
+ atomic_dec(&mdsc->num_sessions);
}
/*
@@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
struct ceph_options *opt = mdsc->fsc->client->options;
void *p;
- const char* metadata[3][2] = {
+ const char* metadata[][2] = {
{"hostname", utsname()->nodename},
+ {"kernel_version", utsname()->release},
{"entity_id", opt->name ? opt->name : ""},
{NULL, NULL}
};
@@ -1464,19 +1467,33 @@ out_unlocked:
return err;
}
+static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_flushing_caps)
+ ret = ci->i_cap_flush_seq >= want_flush_seq;
+ else
+ ret = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
/*
* flush all dirty inode data to disk.
*
* returns true if we've flushed through want_flush_seq
*/
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
- int mds, ret = 1;
+ int mds;
dout("check_cap_flush want %lld\n", want_flush_seq);
mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
struct ceph_mds_session *session = mdsc->sessions[mds];
+ struct inode *inode = NULL;
if (!session)
continue;
@@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
list_entry(session->s_cap_flushing.next,
struct ceph_inode_info,
i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
+ if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
+ "seq %lld <= %lld to mds%d\n",
+ &ci->vfs_inode, ci->i_cap_flush_seq,
+ want_flush_seq, session->s_mds);
+ inode = igrab(&ci->vfs_inode);
}
- spin_unlock(&ci->i_ceph_lock);
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
- if (!ret)
- return ret;
+ if (inode) {
+ wait_event(mdsc->cap_flushing_wq,
+ check_cap_flush(inode, want_flush_seq));
+ iput(inode);
+ }
+
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
}
/*
@@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->num_releases = cpu_to_le16(releases);
/* time stamp */
- ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
@@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
/* time stamp */
p = msg->front.iov_base + req->r_request_release_offset;
- ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
p = rb_next(p);
if (req->r_got_unsafe)
continue;
+ if (req->r_attempts > 0)
+ continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
dout(" kicking tid %llu\n", req->r_tid);
@@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ struct ceph_snap_realm *realm;
u64 tid;
int err, result;
int mds = session->s_mds;
@@ -2401,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
}
/* snap trace */
+ realm = NULL;
if (rinfo->snapblob_len) {
down_write(&mdsc->snap_rwsem);
ceph_update_snap_trace(mdsc, rinfo->snapblob,
- rinfo->snapblob + rinfo->snapblob_len,
- le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+ &realm);
downgrade_write(&mdsc->snap_rwsem);
} else {
down_read(&mdsc->snap_rwsem);
@@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
BUG_ON(req->r_got_result);
+ req->r_attempts = 0;
req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds;
put_request_session(req);
@@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session,
send_flushmsg_ack(mdsc, session, seq);
break;
+ case CEPH_SESSION_FORCE_RO:
+ dout("force_session_readonly %p\n", session);
+ spin_lock(&session->s_cap_lock);
+ session->s_readonly = true;
+ spin_unlock(&session->s_cap_lock);
+ wake_up_session_caps(session, 0);
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_request *req, *nreq;
+ struct rb_node *p;
int err;
dout("replay_unsafe_requests mds%d\n", session->s_mds);
@@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
ceph_con_send(&session->s_con, req->r_request);
}
}
+
+ /*
+ * also re-send old requests when MDS enters reconnect stage. So that MDS
+ * can process completed request in clientreplay stage.
+ */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (req->r_got_unsafe)
+ continue;
+ if (req->r_attempts == 0)
+ continue; /* only old requests */
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds) {
+ err = __prepare_send_request(mdsc, req, session->s_mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+ }
mutex_unlock(&mdsc->mutex);
}
@@ -2787,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_gen_ttl_lock);
spin_lock(&session->s_cap_lock);
+ /* don't know if session is readonly */
+ session->s_readonly = 0;
/*
* notify __ceph_remove_cap() that we are composing cap reconnect.
* If a cap get released before being added to the cap reconnect,
@@ -2933,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc,
mutex_unlock(&s->s_mutex);
s->s_state = CEPH_MDS_SESSION_RESTARTING;
}
-
- /* kick any requests waiting on the recovering mds */
- kick_requests(mdsc, i);
} else if (oldstate == newstate) {
continue; /* nothing new with this mds */
}
@@ -3295,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->sessions = NULL;
+ atomic_set(&mdsc->num_sessions, 0);
mdsc->max_sessions = 0;
mdsc->stopping = 0;
init_rwsem(&mdsc->snap_rwsem);
@@ -3428,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
ceph_flush_dirty_caps(mdsc);
+ spin_lock(&mdsc->cap_dirty_lock);
+ want_flush = mdsc->cap_flush_seq;
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3443,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
- int i, n = 0;
-
if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return true;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++)
- if (mdsc->sessions[i])
- n++;
- mutex_unlock(&mdsc->mutex);
- return n == 0;
+ return atomic_read(&mdsc->num_sessions) == 0;
}
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e2817d0..1875b5d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -137,6 +137,7 @@ struct ceph_mds_session {
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
+ int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
@@ -272,6 +273,7 @@ struct ceph_mds_client {
struct list_head waiting_for_map;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
+ atomic_t num_sessions;
int max_sessions; /* len of s_mds_sessions */
int stopping; /* true if shutting down */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ce35fbd..a97e39f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
* safe. we do need to protect against concurrent empty list
* additions, however.
*/
- if (atomic_read(&realm->nref) == 0) {
+ if (atomic_inc_return(&realm->nref) == 1) {
spin_lock(&mdsc->snap_empty_lock);
list_del_init(&realm->empty_item);
spin_unlock(&mdsc->snap_empty_lock);
}
-
- atomic_inc(&realm->nref);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ atomic_set(&realm->nref, 1); /* for caller */
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
@@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
*
* caller must hold snap_rwsem for write.
*/
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino)
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
{
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
@@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
return NULL;
}
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *r;
+ r = __lookup_snap_realm(mdsc, ino);
+ if (r)
+ ceph_get_snap_realm(mdsc, r);
+ return r;
+}
+
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
@@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
}
realm->parent_ino = parentino;
realm->parent = parent;
- ceph_get_snap_realm(mdsc, parent);
list_add(&realm->child_item, &parent->children);
return 1;
}
@@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
* Caller must hold snap_rwsem for write.
*/
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
- void *p, void *e, bool deletion)
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret)
{
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *first_realm = NULL;
int invalidate = 0;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
@@ -704,13 +713,18 @@ more:
dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
realm, invalidate, p, e);
- if (p < e)
- goto more;
-
/* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate)
+ if (invalidate && p >= e)
rebuild_snap_realms(realm);
+ if (!first_realm)
+ first_realm = realm;
+ else
+ ceph_put_snap_realm(mdsc, realm);
+
+ if (p < e)
+ goto more;
+
/*
* queue cap snaps _after_ we've built the new snap contexts,
* so that i_head_snapc can be set appropriately.
@@ -721,12 +735,21 @@ more:
queue_realm_cap_snaps(realm);
}
+ if (realm_ret)
+ *realm_ret = first_realm;
+ else
+ ceph_put_snap_realm(mdsc, first_realm);
+
__cleanup_empty_realms(mdsc);
return 0;
bad:
err = -EINVAL;
fail:
+ if (realm && !IS_ERR(realm))
+ ceph_put_snap_realm(mdsc, realm);
+ if (first_realm)
+ ceph_put_snap_realm(mdsc, first_realm);
pr_err("update_snap_trace error %d\n", err);
return err;
}
@@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
if (IS_ERR(realm))
goto out;
}
- ceph_get_snap_realm(mdsc, realm);
dout("splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
@@ -905,7 +927,7 @@ skip_inode:
/* we may have taken some of the old realm's children. */
for (i = 0; i < num_split_realms; i++) {
struct ceph_snap_realm *child =
- ceph_lookup_snap_realm(mdsc,
+ __lookup_snap_realm(mdsc,
le64_to_cpu(split_realms[i]));
if (!child)
continue;
@@ -918,7 +940,7 @@ skip_inode:
* snap, we can avoid queueing cap_snaps.
*/
ceph_update_snap_trace(mdsc, p, e,
- op == CEPH_SNAP_OP_DESTROY);
+ op == CEPH_SNAP_OP_DESTROY, NULL);
if (op == CEPH_SNAP_OP_SPLIT)
/* we took a reference when we created the realm, above */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 5ae6258..a63997b 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -414,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noshare");
if (opt->flags & CEPH_OPT_NOCRC)
seq_puts(m, ",nocrc");
+ if (opt->flags & CEPH_OPT_NOMSGAUTH)
+ seq_puts(m, ",nocephx_require_signatures");
+ if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+ seq_puts(m, ",notcp_nodelay");
if (opt->name)
seq_printf(m, ",name=%s", opt->name);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e1aa32d..04c8124 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
- void *p, void *e, bool deletion);
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
@@ -892,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
int ceph_uninline_data(struct file *filp, struct page *locked_page);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
+extern const struct file_operations ceph_snapdir_fops;
extern const struct inode_operations ceph_dir_iops;
+extern const struct inode_operations ceph_snapdir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8fe1f7a..a94b3e6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1129,7 +1129,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock;
struct file_lock_context *flctx = inode->i_flctx;
- unsigned int i;
+ unsigned int count = 0, i;
int rc = 0, xid, type;
struct list_head locks_to_send, *el;
struct lock_to_push *lck, *tmp;
@@ -1140,14 +1140,20 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
if (!flctx)
goto out;
+ spin_lock(&flctx->flc_lock);
+ list_for_each(el, &flctx->flc_posix) {
+ count++;
+ }
+ spin_unlock(&flctx->flc_lock);
+
INIT_LIST_HEAD(&locks_to_send);
/*
- * Allocating flc_posix_cnt locks is enough because no FL_POSIX locks
- * can be added to the list while we are holding cinode->lock_sem that
+ * Allocating count locks is enough because no FL_POSIX locks can be
+ * added to the list while we are holding cinode->lock_sem that
* protects locking operations of this inode.
*/
- for (i = 0; i < flctx->flc_posix_cnt; i++) {
+ for (i = 0; i < count; i++) {
lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
if (!lck) {
rc = -ENOMEM;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 281ee01..60cb88c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -304,7 +304,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
(const char *) old_name, (const char *)new_name);
if (!error) {
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+ if (d_is_dir(new_dentry)) {
coda_dir_drop_nlink(old_dir);
coda_dir_inc_nlink(new_dir);
}
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index a315677..b65d1ef 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -69,14 +69,13 @@ extern struct kmem_cache *configfs_dir_cachep;
extern int configfs_is_root(struct config_item *item);
extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
-extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
+extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
struct dentry *, void *, umode_t, int);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
-extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index c9c298b..cf0db00 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -240,60 +240,26 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
return 0;
}
-static int init_dir(struct inode * inode)
+static void init_dir(struct inode * inode)
{
inode->i_op = &configfs_dir_inode_operations;
inode->i_fop = &configfs_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
- return 0;
}
-static int configfs_init_file(struct inode * inode)
+static void configfs_init_file(struct inode * inode)
{
inode->i_size = PAGE_SIZE;
inode->i_fop = &configfs_file_operations;
- return 0;
}
-static int init_symlink(struct inode * inode)
+static void init_symlink(struct inode * inode)
{
inode->i_op = &configfs_symlink_inode_operations;
- return 0;
-}
-
-static int create_dir(struct config_item *k, struct dentry *d)
-{
- int error;
- umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
- struct dentry *p = d->d_parent;
-
- BUG_ON(!k);
-
- error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
- if (!error)
- error = configfs_make_dirent(p->d_fsdata, d, k, mode,
- CONFIGFS_DIR | CONFIGFS_USET_CREATING);
- if (!error) {
- configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
- error = configfs_create(d, mode, init_dir);
- if (!error) {
- inc_nlink(p->d_inode);
- } else {
- struct configfs_dirent *sd = d->d_fsdata;
- if (sd) {
- spin_lock(&configfs_dirent_lock);
- list_del_init(&sd->s_sibling);
- spin_unlock(&configfs_dirent_lock);
- configfs_put(sd);
- }
- }
- }
- return error;
}
-
/**
* configfs_create_dir - create a directory for an config_item.
* @item: config_itemwe're creating directory for.
@@ -303,11 +269,37 @@ static int create_dir(struct config_item *k, struct dentry *d)
* until it is validated by configfs_dir_set_ready()
*/
-static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
+static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
{
- int error = create_dir(item, dentry);
- if (!error)
+ int error;
+ umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
+ struct dentry *p = dentry->d_parent;
+
+ BUG_ON(!item);
+
+ error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name);
+ if (unlikely(error))
+ return error;
+
+ error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
+ CONFIGFS_DIR | CONFIGFS_USET_CREATING);
+ if (unlikely(error))
+ return error;
+
+ configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata);
+ error = configfs_create(dentry, mode, init_dir);
+ if (!error) {
+ inc_nlink(p->d_inode);
item->ci_dentry = dentry;
+ } else {
+ struct configfs_dirent *sd = dentry->d_fsdata;
+ if (sd) {
+ spin_lock(&configfs_dirent_lock);
+ list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
+ configfs_put(sd);
+ }
+ }
return error;
}
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 1d1c41f..56d2cdc 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -313,21 +313,6 @@ const struct file_operations configfs_file_operations = {
.release = configfs_release,
};
-
-int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
-{
- struct configfs_dirent * parent_sd = dir->d_fsdata;
- umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
- int error = 0;
-
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
- error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
- mutex_unlock(&dir->d_inode->i_mutex);
-
- return error;
-}
-
-
/**
* configfs_create_file - create an attribute file for an item.
* @item: item we're creating for.
@@ -336,9 +321,16 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att
int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
{
- BUG_ON(!item || !item->ci_dentry || !attr);
+ struct dentry *dir = item->ci_dentry;
+ struct configfs_dirent *parent_sd = dir->d_fsdata;
+ umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
+ int error = 0;
- return configfs_add_file(item->ci_dentry, attr,
- CONFIGFS_ITEM_ATTR);
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+ error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
+ CONFIGFS_ITEM_ATTR);
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ return error;
}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 65af861..5423a6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -176,7 +176,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
#endif /* CONFIG_LOCKDEP */
-int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *))
+int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *))
{
int error = 0;
struct inode *inode = NULL;
@@ -198,13 +198,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct ino
p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
configfs_set_inode_lock_class(sd, inode);
- if (init) {
- error = init(inode);
- if (error) {
- iput(inode);
- return error;
- }
- }
+ init(inode);
d_instantiate(dentry, inode);
if (S_ISDIR(mode) || S_ISLNK(mode))
dget(dentry); /* pin link and directory dentries in core */
@@ -242,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
if (dentry) {
spin_lock(&dentry->d_lock);
- if (!(d_unhashed(dentry) && dentry->d_inode)) {
+ if (!d_unhashed(dentry) && dentry->d_inode) {
dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
diff --git a/fs/coredump.c b/fs/coredump.c
index b5c86ff..f319926 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -572,7 +572,7 @@ void do_coredump(const siginfo_t *siginfo)
*
* Normally core limits are irrelevant to pipes, since
* we're not writing to the file system, but we use
- * cprm.limit of 1 here as a speacial value, this is a
+ * cprm.limit of 1 here as a special value, this is a
* consistent way to catch recursive crashes.
* We can still crash if the core_pattern binary sets
* RLIM_CORE = !1, but it runs as root, and can do
diff --git a/fs/dcache.c b/fs/dcache.c
index dc400fd..d99736a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1659,9 +1659,25 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
}
EXPORT_SYMBOL(d_set_d_op);
+
+/*
+ * d_set_fallthru - Mark a dentry as falling through to a lower layer
+ * @dentry - The dentry to mark
+ *
+ * Mark a dentry as falling through to the lower layer (as set with
+ * d_pin_lower()). This flag may be recorded on the medium.
+ */
+void d_set_fallthru(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_FALLTHRU;
+ spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL(d_set_fallthru);
+
static unsigned d_flags_for_inode(struct inode *inode)
{
- unsigned add_flags = DCACHE_FILE_TYPE;
+ unsigned add_flags = DCACHE_REGULAR_TYPE;
if (!inode)
return DCACHE_MISS_TYPE;
@@ -1674,13 +1690,21 @@ static unsigned d_flags_for_inode(struct inode *inode)
else
inode->i_opflags |= IOP_LOOKUP;
}
- } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
- if (unlikely(inode->i_op->follow_link))
+ goto type_determined;
+ }
+
+ if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+ if (unlikely(inode->i_op->follow_link)) {
add_flags = DCACHE_SYMLINK_TYPE;
- else
- inode->i_opflags |= IOP_NOFOLLOW;
+ goto type_determined;
+ }
+ inode->i_opflags |= IOP_NOFOLLOW;
}
+ if (unlikely(!S_ISREG(inode->i_mode)))
+ add_flags = DCACHE_SPECIAL_TYPE;
+
+type_determined:
if (unlikely(IS_AUTOMOUNT(inode)))
add_flags |= DCACHE_NEED_AUTOMOUNT;
return add_flags;
@@ -1691,7 +1715,8 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
unsigned add_flags = d_flags_for_inode(inode);
spin_lock(&dentry->d_lock);
- __d_set_type(dentry, add_flags);
+ dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ dentry->d_flags |= add_flags;
if (inode)
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
dentry->d_inode = inode;
@@ -2665,7 +2690,7 @@ static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL, *m2 = NULL;
- int ret = -EBUSY;
+ int ret = -ESTALE;
/* If alias and dentry share a parent, then no extra locks required */
if (alias->d_parent == dentry->d_parent)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 45b18a5..96400ab 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -169,10 +169,19 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void debugfs_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_private);
+}
+
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
.remount_fs = debugfs_remount,
.show_options = debugfs_show_options,
+ .evict_inode = debugfs_evict_inode,
};
static struct vfsmount *debugfs_automount(struct path *path)
@@ -511,23 +520,14 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
int ret = 0;
if (debugfs_positive(dentry)) {
- if (dentry->d_inode) {
- dget(dentry);
- switch (dentry->d_inode->i_mode & S_IFMT) {
- case S_IFDIR:
- ret = simple_rmdir(parent->d_inode, dentry);
- break;
- case S_IFLNK:
- kfree(dentry->d_inode->i_private);
- /* fall through */
- default:
- simple_unlink(parent->d_inode, dentry);
- break;
- }
- if (!ret)
- d_delete(dentry);
- dput(dentry);
- }
+ dget(dentry);
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ ret = simple_rmdir(parent->d_inode, dentry);
+ else
+ simple_unlink(parent->d_inode, dentry);
+ if (!ret)
+ d_delete(dentry);
+ dput(dentry);
}
return ret;
}
@@ -690,7 +690,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
}
d_move(old_dentry, dentry);
fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
- S_ISDIR(old_dentry->d_inode->i_mode),
+ d_is_dir(old_dentry),
NULL, old_dentry);
fsnotify_oldname_free(old_name);
unlock_rename(new_dir, old_dir);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 90d1882..5ba029e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key)
}
#define ECRYPTFS_MAX_KEYSET_SIZE 1024
-#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
+#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31
#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */
#define ECRYPTFS_SALT_BYTES 2
@@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat {
struct crypto_ablkcipher *tfm;
struct crypto_hash *hash_tfm; /* Crypto context for generating
* the initialization vectors */
- unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
struct list_head keysig_list;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 273d36e..7967508 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -223,7 +223,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
}
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
- if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+ if (d_is_dir(ecryptfs_dentry)) {
ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
mutex_lock(&crypt_stat->cs_mutex);
crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
@@ -296,9 +296,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOTTY;
- if (lower_file->f_op->unlocked_ioctl)
+ if (!lower_file->f_op->unlocked_ioctl)
+ return rc;
+
+ switch (cmd) {
+ case FITRIM:
+ case FS_IOC_GETFLAGS:
+ case FS_IOC_SETFLAGS:
+ case FS_IOC_GETVERSION:
+ case FS_IOC_SETVERSION:
rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
- return rc;
+ fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+ return rc;
+ default:
+ return rc;
+ }
}
#ifdef CONFIG_COMPAT
@@ -308,9 +321,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOIOCTLCMD;
- if (lower_file->f_op->compat_ioctl)
+ if (!lower_file->f_op->compat_ioctl)
+ return rc;
+
+ switch (cmd) {
+ case FITRIM:
+ case FS_IOC32_GETFLAGS:
+ case FS_IOC32_SETFLAGS:
+ case FS_IOC32_GETVERSION:
+ case FS_IOC32_SETVERSION:
rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
- return rc;
+ fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+ return rc;
+ default:
+ return rc;
+ }
}
#endif
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 34b36a5..b08b518 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -907,9 +907,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
lower_inode = ecryptfs_inode_to_lower(inode);
lower_dentry = ecryptfs_dentry_to_lower(dentry);
mutex_lock(&crypt_stat->cs_mutex);
- if (S_ISDIR(dentry->d_inode->i_mode))
+ if (d_is_dir(dentry))
crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
- else if (S_ISREG(dentry->d_inode->i_mode)
+ else if (d_is_reg(dentry)
&& (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)
|| !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) {
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 917bd5c..6bd67e2 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
struct blkcipher_desc desc;
char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
char iv[ECRYPTFS_MAX_IV_BYTES];
- char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+ char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
};
/**
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1895d60..c095d32 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
if (!cipher_name_set) {
int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
- BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+ BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE);
strcpy(mount_crypt_stat->global_default_cipher_name,
ECRYPTFS_DEFAULT_CIPHER);
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index fdfd206..714cd37 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -429,7 +429,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
if (IS_ERR(result))
return result;
- if (S_ISDIR(result->d_inode->i_mode)) {
+ if (d_is_dir(result)) {
/*
* This request is for a directory.
*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 982d934..f63c3d5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -364,7 +364,8 @@ struct flex_groups {
#define EXT4_DIRTY_FL 0x00000100
#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */
-#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */
+ /* nb: was previously EXT2_ECOMPR_FL */
+#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */
@@ -421,7 +422,7 @@ enum {
EXT4_INODE_DIRTY = 8,
EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
EXT4_INODE_NOCOMPR = 10, /* Don't compress */
- EXT4_INODE_ECOMPR = 11, /* Compression error */
+ EXT4_INODE_ENCRYPT = 11, /* Compression error */
/* End compression flags --- maybe not all used */
EXT4_INODE_INDEX = 12, /* hash-indexed directory */
EXT4_INODE_IMAGIC = 13, /* AFS directory */
@@ -466,7 +467,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(DIRTY);
CHECK_FLAG_VALUE(COMPRBLK);
CHECK_FLAG_VALUE(NOCOMPR);
- CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(ENCRYPT);
CHECK_FLAG_VALUE(INDEX);
CHECK_FLAG_VALUE(IMAGIC);
CHECK_FLAG_VALUE(JOURNAL_DATA);
@@ -1048,6 +1049,12 @@ extern void ext4_set_bits(void *bm, int cur, int len);
/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM 1
+/* Encryption algorithms */
+#define EXT4_ENCRYPTION_MODE_INVALID 0
+#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
+#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
+#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
+
/*
* Structure of the super block
*/
@@ -1161,7 +1168,8 @@ struct ext4_super_block {
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
__le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
- __le32 s_reserved[106]; /* Padding to the end of the block */
+ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
+ __le32 s_reserved[105]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1527,6 +1535,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
* GDT_CSUM bits are mutually exclusive.
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1542,6 +1551,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 8611640..740c787 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1401,10 +1401,7 @@ end_range:
* to free. Everything was covered by the start
* of the range.
*/
- return 0;
- } else {
- /* Shared branch grows from an indirect block */
- partial2--;
+ goto do_indirects;
}
} else {
/*
@@ -1435,56 +1432,96 @@ end_range:
/* Punch happened within the same level (n == n2) */
partial = ext4_find_shared(inode, n, offsets, chain, &nr);
partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
- /*
- * ext4_find_shared returns Indirect structure which
- * points to the last element which should not be
- * removed by truncate. But this is end of the range
- * in punch_hole so we need to point to the next element
- */
- partial2->p++;
- while ((partial > chain) || (partial2 > chain2)) {
- /* We're at the same block, so we're almost finished */
- if ((partial->bh && partial2->bh) &&
- (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
- if ((partial > chain) && (partial2 > chain2)) {
+
+ /* Free top, but only if partial2 isn't its subtree. */
+ if (nr) {
+ int level = min(partial - chain, partial2 - chain2);
+ int i;
+ int subtree = 1;
+
+ for (i = 0; i <= level; i++) {
+ if (offsets[i] != offsets2[i]) {
+ subtree = 0;
+ break;
+ }
+ }
+
+ if (!subtree) {
+ if (partial == chain) {
+ /* Shared branch grows from the inode */
+ ext4_free_branches(handle, inode, NULL,
+ &nr, &nr+1,
+ (chain+n-1) - partial);
+ *partial->p = 0;
+ } else {
+ /* Shared branch grows from an indirect block */
+ BUFFER_TRACE(partial->bh, "get_write_access");
ext4_free_branches(handle, inode, partial->bh,
- partial->p + 1,
- partial2->p,
+ partial->p,
+ partial->p+1,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
}
- return 0;
}
+ }
+
+ if (!nr2) {
/*
- * Clear the ends of indirect blocks on the shared branch
- * at the start of the range
+ * ext4_find_shared returns Indirect structure which
+ * points to the last element which should not be
+ * removed by truncate. But this is end of the range
+ * in punch_hole so we need to point to the next element
*/
- if (partial > chain) {
+ partial2->p++;
+ }
+
+ while (partial > chain || partial2 > chain2) {
+ int depth = (chain+n-1) - partial;
+ int depth2 = (chain2+n2-1) - partial2;
+
+ if (partial > chain && partial2 > chain2 &&
+ partial->bh->b_blocknr == partial2->bh->b_blocknr) {
+ /*
+ * We've converged on the same block. Clear the range,
+ * then we're done.
+ */
ext4_free_branches(handle, inode, partial->bh,
- partial->p + 1,
- (__le32 *)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
+ partial->p + 1,
+ partial2->p,
+ (chain+n-1) - partial);
BUFFER_TRACE(partial->bh, "call brelse");
brelse(partial->bh);
- partial--;
+ BUFFER_TRACE(partial2->bh, "call brelse");
+ brelse(partial2->bh);
+ return 0;
}
+
/*
- * Clear the ends of indirect blocks on the shared branch
- * at the end of the range
+ * The start and end partial branches may not be at the same
+ * level even though the punch happened within one level. So, we
+ * give them a chance to arrive at the same level, then walk
+ * them in step with each other until we converge on the same
+ * block.
*/
- if (partial2 > chain2) {
+ if (partial > chain && depth <= depth2) {
+ ext4_free_branches(handle, inode, partial->bh,
+ partial->p + 1,
+ (__le32 *)partial->bh->b_data+addr_per_block,
+ (chain+n-1) - partial);
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+ if (partial2 > chain2 && depth2 <= depth) {
ext4_free_branches(handle, inode, partial2->bh,
(__le32 *)partial2->bh->b_data,
partial2->p,
- (chain2+n-1) - partial2);
+ (chain2+n2-1) - partial2);
BUFFER_TRACE(partial2->bh, "call brelse");
brelse(partial2->bh);
partial2--;
}
}
+ return 0;
do_indirects:
/* Kill the remaining (whole) subtrees */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6325d2c..a3f4513 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1023,6 +1023,7 @@ static int ext4_write_end(struct file *file,
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
int ret = 0, ret2;
int i_size_changed = 0;
@@ -1053,6 +1054,8 @@ static int ext4_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
* makes the holding time of page lock longer. Second, it forces lock
@@ -1094,6 +1097,7 @@ static int ext4_journalled_write_end(struct file *file,
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
@@ -1126,6 +1130,9 @@ static int ext4_journalled_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
+
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1adac68..e061e66 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2779,6 +2779,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
if (readonly)
return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+ ext4_msg(sb, KERN_INFO, "filesystem is read-only");
+ sb->s_flags |= MS_RDONLY;
+ return 1;
+ }
+
/* Check that feature set is OK for a read-write mount */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
@@ -3936,9 +3942,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- init_timer(&sbi->s_err_report);
- sbi->s_err_report.function = print_daily_error_info;
- sbi->s_err_report.data = (unsigned long) sb;
+ setup_timer(&sbi->s_err_report, print_daily_error_info,
+ (unsigned long) sb);
/* Register extent status tree shrinker */
if (ext4_es_register_shrinker(sbi))
@@ -4866,9 +4871,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
- /*
- * Allow the "check" option to be passed as a remount option.
- */
if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
err = -EINVAL;
goto restore_opts;
@@ -4877,17 +4879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
- "during remount not supported");
- err = -EINVAL;
- goto restore_opts;
- }
-
- if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
- test_opt(sb, JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "changing journal_checksum "
- "during remount not supported");
- err = -EINVAL;
- goto restore_opts;
+ "during remount not supported; ignoring");
+ sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
}
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -4963,7 +4956,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
} else {
/* Make sure we can mount this feature set readwrite */
- if (!ext4_feature_set_ok(sb, 0)) {
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_READONLY) ||
+ !ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 073657f..e907052 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -769,9 +769,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
- if (!grab_super_passive(sb)) {
+ if (!trylock_super(sb)) {
/*
- * grab_super_passive() may fail consistently due to
+ * trylock_super() may fail consistently due to
* s_umount being grabbed by someone else. Don't use
* requeue_io() to avoid busy retrying the inode/sb.
*/
@@ -779,7 +779,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
- drop_super(sb);
+ up_read(&sb->s_umount);
/* refer to the same tests at the end of writeback_sb_inodes */
if (wrote) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8c92c72..95a2797 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -889,8 +889,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
newpage = buf->page;
- if (WARN_ON(!PageUptodate(newpage)))
- return -EIO;
+ if (!PageUptodate(newpage))
+ SetPageUptodate(newpage);
ClearPageMappedToDisk(newpage);
@@ -1352,6 +1352,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
return err;
}
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The fuse device's file's private_data is used to hold
+ * the fuse_conn(ection) when it is mounted, and is used to
+ * keep track of whether the file has been mounted already.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
@@ -1796,6 +1807,9 @@ copy_finish:
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
+ /* Don't try to move pages (yet) */
+ cs->move_pages = 0;
+
switch (code) {
case FUSE_NOTIFY_POLL:
return fuse_notify_poll(fc, size, cs);
@@ -2216,6 +2230,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
+ .open = fuse_dev_open,
.llseek = no_llseek,
.read = do_sync_read,
.aio_read = fuse_dev_read,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 08e7b1a..1545b71 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -971,7 +971,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
err = -EBUSY;
goto badentry;
}
- if (S_ISDIR(entry->d_inode->i_mode)) {
+ if (d_is_dir(entry)) {
shrink_dcache_parent(entry);
if (!simple_empty(entry)) {
err = -ENOTEMPTY;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 6371192..487527b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1809,7 +1809,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
gfs2_consist_inode(dip);
dip->i_entries--;
dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
- if (S_ISDIR(dentry->d_inode->i_mode))
+ if (d_is_dir(dentry))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 435bea2..f0235c1 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -530,7 +530,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode))
+ if (d_is_dir(new_dentry))
res = hfsplus_rmdir(new_dir, new_dentry);
else
res = hfsplus_unlink(new_dir, new_dentry);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 5f27551..043ac9d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -678,10 +678,10 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
return NULL;
}
- if (S_ISDIR(dentry->d_inode->i_mode)) {
+ if (d_is_dir(dentry)) {
inode->i_op = &hppfs_dir_iops;
inode->i_fop = &hppfs_dir_fops;
- } else if (S_ISLNK(dentry->d_inode->i_mode)) {
+ } else if (d_is_symlink(dentry)) {
inode->i_op = &hppfs_link_iops;
inode->i_fop = &hppfs_file_fops;
} else {
diff --git a/fs/internal.h b/fs/internal.h
index 30459da..01dce1d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -84,7 +84,7 @@ extern struct file *get_empty_filp(void);
* super.c
*/
extern int do_remount_sb(struct super_block *, int, void *, int);
-extern bool grab_super_passive(struct super_block *sb);
+extern bool trylock_super(struct super_block *sb);
extern struct dentry *mount_fs(struct file_system_type *,
int, const char *, void *);
extern struct super_block *user_get_super(dev_t);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index bcbef08..b5128c6 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -524,6 +524,9 @@ static int do_one_pass(journal_t *journal,
if (descr_csum_size > 0 &&
!jbd2_descr_block_csum_verify(journal,
bh->b_data)) {
+ printk(KERN_ERR "JBD2: Invalid checksum "
+ "recovering block %lu in log\n",
+ next_log_block);
err = -EIO;
brelse(bh);
goto failed;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 92e0644b..556de10 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp)
return bit;
}
-static inline int pulledbits(struct pushpull *pp)
-{
- return pp->ofs;
-}
-
static void init_rubin(struct rubin_state *rs, int div, int *bits)
{
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9385560..f21b6fb 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -252,7 +252,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
if (!f->inocache)
return -EIO;
- if (S_ISDIR(old_dentry->d_inode->i_mode))
+ if (d_is_dir(old_dentry))
return -EPERM;
/* XXX: This is ugly */
@@ -772,7 +772,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
*/
if (new_dentry->d_inode) {
victim_f = JFFS2_INODE_INFO(new_dentry->d_inode);
- if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+ if (d_is_dir(new_dentry)) {
struct jffs2_full_dirent *fd;
mutex_lock(&victim_f->sem);
@@ -807,7 +807,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
if (victim_f) {
/* There was a victim. Kill it off nicely */
- if (S_ISDIR(new_dentry->d_inode->i_mode))
+ if (d_is_dir(new_dentry))
clear_nlink(new_dentry->d_inode);
else
drop_nlink(new_dentry->d_inode);
@@ -815,7 +815,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
inode which didn't exist. */
if (victim_f->inocache) {
mutex_lock(&victim_f->sem);
- if (S_ISDIR(new_dentry->d_inode->i_mode))
+ if (d_is_dir(new_dentry))
victim_f->inocache->pino_nlink = 0;
else
victim_f->inocache->pino_nlink--;
@@ -825,7 +825,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
/* If it was a directory we moved, and there was no victim,
increase i_nlink on its new parent */
- if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f)
+ if (d_is_dir(old_dentry) && !victim_f)
inc_nlink(new_dir_i);
/* Unlink the original */
@@ -839,7 +839,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
mutex_lock(&f->sem);
inc_nlink(old_dentry->d_inode);
- if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode))
+ if (f->inocache && !d_is_dir(old_dentry))
f->inocache->pino_nlink++;
mutex_unlock(&f->sem);
@@ -852,7 +852,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
return ret;
}
- if (S_ISDIR(old_dentry->d_inode->i_mode))
+ if (d_is_dir(old_dentry))
drop_nlink(old_dir_i);
new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87..9ad5ba4 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
sumlen = c->sector_size - je32_to_cpu(sm->offset);
sumptr = buf + buf_size - sumlen;
+ /* sm->offset maybe wrong but MAGIC maybe right */
+ if (sumlen > c->sector_size)
+ goto full_scan;
+
/* Now, make sure the summary itself is available */
if (sumlen > buf_size) {
/* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
}
}
+full_scan:
buf_ofs = jeb->offset;
if (!buf_size) {
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0918f0e..3d76f28 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -138,7 +138,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
struct jffs2_inode_info *f;
uint32_t pino;
- BUG_ON(!S_ISDIR(child->d_inode->i_mode));
+ BUG_ON(!d_is_dir(child));
f = JFFS2_INODE_INFO(child->d_inode);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b684e8a..2bacb99 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
goto out_free;
}
+ of->event = atomic_read(&of->kn->attr.open->event);
ops = kernfs_ops(of->kn);
if (ops->read)
len = ops->read(of, buf, len, *ppos);
diff --git a/fs/libfs.c b/fs/libfs.c
index b2ffdb0..0ab6512 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -329,7 +329,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct inode *inode = old_dentry->d_inode;
- int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
+ int they_are_dirs = d_is_dir(old_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
diff --git a/fs/locks.c b/fs/locks.c
index 4753218..528fedf 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -681,21 +681,18 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
}
static void
-locks_insert_lock_ctx(struct file_lock *fl, int *counter,
- struct list_head *before)
+locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
{
fl->fl_nspid = get_pid(task_tgid(current));
list_add_tail(&fl->fl_list, before);
- ++*counter;
locks_insert_global_locks(fl);
}
static void
-locks_unlink_lock_ctx(struct file_lock *fl, int *counter)
+locks_unlink_lock_ctx(struct file_lock *fl)
{
locks_delete_global_locks(fl);
list_del_init(&fl->fl_list);
- --*counter;
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
fl->fl_nspid = NULL;
@@ -704,10 +701,9 @@ locks_unlink_lock_ctx(struct file_lock *fl, int *counter)
}
static void
-locks_delete_lock_ctx(struct file_lock *fl, int *counter,
- struct list_head *dispose)
+locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
{
- locks_unlink_lock_ctx(fl, counter);
+ locks_unlink_lock_ctx(fl);
if (dispose)
list_add(&fl->fl_list, dispose);
else
@@ -895,7 +891,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
if (request->fl_type == fl->fl_type)
goto out;
found = true;
- locks_delete_lock_ctx(fl, &ctx->flc_flock_cnt, &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
break;
}
@@ -905,16 +901,6 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
goto out;
}
- /*
- * If a higher-priority process was blocked on the old file lock,
- * give it the opportunity to lock the file.
- */
- if (found) {
- spin_unlock(&ctx->flc_lock);
- cond_resched();
- spin_lock(&ctx->flc_lock);
- }
-
find_conflict:
list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
if (!flock_locks_conflict(request, fl))
@@ -929,7 +915,7 @@ find_conflict:
if (request->fl_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
- locks_insert_lock_ctx(new_fl, &ctx->flc_flock_cnt, &ctx->flc_flock);
+ locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
new_fl = NULL;
error = 0;
@@ -1046,8 +1032,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
else
request->fl_end = fl->fl_end;
if (added) {
- locks_delete_lock_ctx(fl, &ctx->flc_posix_cnt,
- &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
continue;
}
request = fl;
@@ -1076,8 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
* one (This may happen several times).
*/
if (added) {
- locks_delete_lock_ctx(fl,
- &ctx->flc_posix_cnt, &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
continue;
}
/*
@@ -1093,10 +1077,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_copy_lock(new_fl, request);
request = new_fl;
new_fl = NULL;
- locks_insert_lock_ctx(request,
- &ctx->flc_posix_cnt, &fl->fl_list);
- locks_delete_lock_ctx(fl,
- &ctx->flc_posix_cnt, &dispose);
+ locks_insert_lock_ctx(request, &fl->fl_list);
+ locks_delete_lock_ctx(fl, &dispose);
added = true;
}
}
@@ -1124,8 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
goto out;
}
locks_copy_lock(new_fl, request);
- locks_insert_lock_ctx(new_fl, &ctx->flc_posix_cnt,
- &fl->fl_list);
+ locks_insert_lock_ctx(new_fl, &fl->fl_list);
+ fl = new_fl;
new_fl = NULL;
}
if (right) {
@@ -1136,8 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
left = new_fl2;
new_fl2 = NULL;
locks_copy_lock(left, right);
- locks_insert_lock_ctx(left, &ctx->flc_posix_cnt,
- &fl->fl_list);
+ locks_insert_lock_ctx(left, &fl->fl_list);
}
right->fl_start = request->fl_end + 1;
locks_wake_up_blocks(right);
@@ -1321,7 +1302,6 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
{
- struct file_lock_context *flctx;
int error = assign_type(fl, arg);
if (error)
@@ -1331,7 +1311,6 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
if (arg == F_UNLCK) {
struct file *filp = fl->fl_file;
- flctx = file_inode(filp)->i_flctx;
f_delown(filp);
filp->f_owner.signum = 0;
fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
@@ -1339,7 +1318,7 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
fl->fl_fasync = NULL;
}
- locks_delete_lock_ctx(fl, &flctx->flc_lease_cnt, dispose);
+ locks_delete_lock_ctx(fl, dispose);
}
return 0;
}
@@ -1456,8 +1435,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
fl->fl_downgrade_time = break_time;
}
if (fl->fl_lmops->lm_break(fl))
- locks_delete_lock_ctx(fl, &ctx->flc_lease_cnt,
- &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
}
if (list_empty(&ctx->flc_lease))
@@ -1687,7 +1665,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
}
if (my_fl != NULL) {
- error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
+ lease = my_fl;
+ error = lease->fl_lmops->lm_change(lease, arg, &dispose);
if (error)
goto out;
goto out_setup;
@@ -1697,7 +1676,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
if (!leases_enable)
goto out;
- locks_insert_lock_ctx(lease, &ctx->flc_lease_cnt, &ctx->flc_lease);
+ locks_insert_lock_ctx(lease, &ctx->flc_lease);
/*
* The check in break_lease() is lockless. It's possible for another
* open to race in after we did the earlier check for a conflicting
@@ -1710,7 +1689,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
smp_mb();
error = check_conflicting_open(dentry, arg, lease->fl_flags);
if (error) {
- locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt);
+ locks_unlink_lock_ctx(lease);
goto out;
}
@@ -1749,7 +1728,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
break;
}
}
- trace_generic_delete_lease(inode, fl);
+ trace_generic_delete_lease(inode, victim);
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
@@ -2448,7 +2427,8 @@ locks_remove_lease(struct file *filp)
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
- lease_modify(fl, F_UNLCK, &dispose);
+ if (filp == fl->fl_file)
+ lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
}
diff --git a/fs/namei.c b/fs/namei.c
index 96ca11d..76fb76a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -119,15 +119,14 @@
* PATH_MAX includes the nul terminator --RR.
*/
-#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
+#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
- struct filename *result, *err;
- int len;
- long max;
+ struct filename *result;
char *kname;
+ int len;
result = audit_reusename(filename);
if (result)
@@ -136,22 +135,18 @@ getname_flags(const char __user *filename, int flags, int *empty)
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
- result->refcnt = 1;
/*
* First, try to embed the struct filename inside the names_cache
* allocation
*/
- kname = (char *)result + sizeof(*result);
+ kname = (char *)result->iname;
result->name = kname;
- result->separate = false;
- max = EMBEDDED_NAME_MAX;
-recopy:
- len = strncpy_from_user(kname, filename, max);
+ len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
if (unlikely(len < 0)) {
- err = ERR_PTR(len);
- goto error;
+ __putname(result);
+ return ERR_PTR(len);
}
/*
@@ -160,43 +155,49 @@ recopy:
* names_cache allocation for the pathname, and re-do the copy from
* userland.
*/
- if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
+ if (unlikely(len == EMBEDDED_NAME_MAX)) {
+ const size_t size = offsetof(struct filename, iname[1]);
kname = (char *)result;
- result = kzalloc(sizeof(*result), GFP_KERNEL);
- if (!result) {
- err = ERR_PTR(-ENOMEM);
- result = (struct filename *)kname;
- goto error;
+ /*
+ * size is chosen that way we to guarantee that
+ * result->iname[0] is within the same object and that
+ * kname can't be equal to result->iname, no matter what.
+ */
+ result = kzalloc(size, GFP_KERNEL);
+ if (unlikely(!result)) {
+ __putname(kname);
+ return ERR_PTR(-ENOMEM);
}
result->name = kname;
- result->separate = true;
- result->refcnt = 1;
- max = PATH_MAX;
- goto recopy;
+ len = strncpy_from_user(kname, filename, PATH_MAX);
+ if (unlikely(len < 0)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(len);
+ }
+ if (unlikely(len == PATH_MAX)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
}
+ result->refcnt = 1;
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
*empty = 1;
- err = ERR_PTR(-ENOENT);
- if (!(flags & LOOKUP_EMPTY))
- goto error;
+ if (!(flags & LOOKUP_EMPTY)) {
+ putname(result);
+ return ERR_PTR(-ENOENT);
+ }
}
- err = ERR_PTR(-ENAMETOOLONG);
- if (unlikely(len >= PATH_MAX))
- goto error;
-
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
return result;
-
-error:
- putname(result);
- return err;
}
struct filename *
@@ -216,8 +217,7 @@ getname_kernel(const char * filename)
return ERR_PTR(-ENOMEM);
if (len <= EMBEDDED_NAME_MAX) {
- result->name = (char *)(result) + sizeof(*result);
- result->separate = false;
+ result->name = (char *)result->iname;
} else if (len <= PATH_MAX) {
struct filename *tmp;
@@ -227,7 +227,6 @@ getname_kernel(const char * filename)
return ERR_PTR(-ENOMEM);
}
tmp->name = (char *)result;
- tmp->separate = true;
result = tmp;
} else {
__putname(result);
@@ -249,7 +248,7 @@ void putname(struct filename *name)
if (--name->refcnt > 0)
return;
- if (name->separate) {
+ if (name->name != name->iname) {
__putname(name->name);
kfree(name);
} else
@@ -1851,10 +1850,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
return err;
}
-static int path_init(int dfd, const char *name, unsigned int flags,
+static int path_init(int dfd, const struct filename *name, unsigned int flags,
struct nameidata *nd)
{
int retval = 0;
+ const char *s = name->name;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
@@ -1863,7 +1863,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
- if (*name) {
+ if (*s) {
if (!d_can_lookup(root))
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
@@ -1885,7 +1885,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->root.mnt = NULL;
nd->m_seq = read_seqbegin(&mount_lock);
- if (*name=='/') {
+ if (*s == '/') {
if (flags & LOOKUP_RCU) {
rcu_read_lock();
nd->seq = set_root_rcu(nd);
@@ -1919,7 +1919,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
dentry = f.file->f_path.dentry;
- if (*name) {
+ if (*s) {
if (!d_can_lookup(dentry)) {
fdput(f);
return -ENOTDIR;
@@ -1949,7 +1949,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
return -ECHILD;
done:
current->total_link_count = 0;
- return link_path_walk(name, nd);
+ return link_path_walk(s, nd);
}
static void path_cleanup(struct nameidata *nd)
@@ -1972,7 +1972,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int path_lookupat(int dfd, const char *name,
+static int path_lookupat(int dfd, const struct filename *name,
unsigned int flags, struct nameidata *nd)
{
struct path path;
@@ -2027,31 +2027,17 @@ static int path_lookupat(int dfd, const char *name,
static int filename_lookup(int dfd, struct filename *name,
unsigned int flags, struct nameidata *nd)
{
- int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
+ int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
if (unlikely(retval == -ECHILD))
- retval = path_lookupat(dfd, name->name, flags, nd);
+ retval = path_lookupat(dfd, name, flags, nd);
if (unlikely(retval == -ESTALE))
- retval = path_lookupat(dfd, name->name,
- flags | LOOKUP_REVAL, nd);
+ retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
if (likely(!retval))
audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
return retval;
}
-static int do_path_lookup(int dfd, const char *name,
- unsigned int flags, struct nameidata *nd)
-{
- struct filename *filename = getname_kernel(name);
- int retval = PTR_ERR(filename);
-
- if (!IS_ERR(filename)) {
- retval = filename_lookup(dfd, filename, flags, nd);
- putname(filename);
- }
- return retval;
-}
-
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
@@ -2089,9 +2075,15 @@ out:
int kern_path(const char *name, unsigned int flags, struct path *path)
{
struct nameidata nd;
- int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
- if (!res)
- *path = nd.path;
+ struct filename *filename = getname_kernel(name);
+ int res = PTR_ERR(filename);
+
+ if (!IS_ERR(filename)) {
+ res = filename_lookup(AT_FDCWD, filename, flags, &nd);
+ putname(filename);
+ if (!res)
+ *path = nd.path;
+ }
return res;
}
EXPORT_SYMBOL(kern_path);
@@ -2108,15 +2100,22 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
const char *name, unsigned int flags,
struct path *path)
{
- struct nameidata nd;
- int err;
- nd.root.dentry = dentry;
- nd.root.mnt = mnt;
+ struct filename *filename = getname_kernel(name);
+ int err = PTR_ERR(filename);
+
BUG_ON(flags & LOOKUP_PARENT);
- /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
- err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
- if (!err)
- *path = nd.path;
+
+ /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
+ if (!IS_ERR(filename)) {
+ struct nameidata nd;
+ nd.root.dentry = dentry;
+ nd.root.mnt = mnt;
+ err = filename_lookup(AT_FDCWD, filename,
+ flags | LOOKUP_ROOT, &nd);
+ if (!err)
+ *path = nd.path;
+ putname(filename);
+ }
return err;
}
EXPORT_SYMBOL(vfs_path_lookup);
@@ -2138,9 +2137,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
* @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code. Also note that by using this function the
- * nameidata argument is passed to the filesystem methods and a filesystem
- * using this helper needs to be prepared for that.
+ * not be called by generic code.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2341,7 +2338,8 @@ out:
* Returns 0 and "path" will be valid on success; Returns error otherwise.
*/
static int
-path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
+path_mountpoint(int dfd, const struct filename *name, struct path *path,
+ unsigned int flags)
{
struct nameidata nd;
int err;
@@ -2370,20 +2368,20 @@ out:
}
static int
-filename_mountpoint(int dfd, struct filename *s, struct path *path,
+filename_mountpoint(int dfd, struct filename *name, struct path *path,
unsigned int flags)
{
int error;
- if (IS_ERR(s))
- return PTR_ERR(s);
- error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+ error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
if (unlikely(error == -ECHILD))
- error = path_mountpoint(dfd, s->name, path, flags);
+ error = path_mountpoint(dfd, name, path, flags);
if (unlikely(error == -ESTALE))
- error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
+ error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
if (likely(!error))
- audit_inode(s, path->dentry, 0);
- putname(s);
+ audit_inode(name, path->dentry, 0);
+ putname(name);
return error;
}
@@ -2814,7 +2812,7 @@ no_open:
} else if (!dentry->d_inode) {
goto out;
} else if ((open_flag & O_TRUNC) &&
- S_ISREG(dentry->d_inode->i_mode)) {
+ d_is_reg(dentry)) {
goto out;
}
/* will fail later, go on to get the right error */
@@ -3156,7 +3154,7 @@ static int do_tmpfile(int dfd, struct filename *pathname,
static const struct qstr name = QSTR_INIT("/", 1);
struct dentry *dentry, *child;
struct inode *dir;
- int error = path_lookupat(dfd, pathname->name,
+ int error = path_lookupat(dfd, pathname,
flags | LOOKUP_DIRECTORY, nd);
if (unlikely(error))
return error;
@@ -3229,7 +3227,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
goto out;
}
- error = path_init(dfd, pathname->name, flags, nd);
+ error = path_init(dfd, pathname, flags, nd);
if (unlikely(error))
goto out;
diff --git a/fs/namespace.c b/fs/namespace.c
index 72a286e..82ef140 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1907,8 +1907,8 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
- if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
- S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
+ if (d_is_dir(mp->m_dentry) !=
+ d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
return attach_recursive_mnt(mnt, p, mp, NULL);
@@ -2180,8 +2180,8 @@ static int do_move_mount(struct path *path, const char *old_name)
if (!mnt_has_parent(old))
goto out1;
- if (S_ISDIR(path->dentry->d_inode->i_mode) !=
- S_ISDIR(old_path.dentry->d_inode->i_mode))
+ if (d_is_dir(path->dentry) !=
+ d_is_dir(old_path.dentry))
goto out1;
/*
* Don't move a mount residing in a shared parent.
@@ -2271,7 +2271,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
goto unlock;
err = -EINVAL;
- if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
+ if (d_is_symlink(newmnt->mnt.mnt_root))
goto unlock;
newmnt->mnt.mnt_flags = mnt_flags;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e36a9d7..197806f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
if (clp == NULL)
goto out;
+ if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+ goto out;
tbl = &clp->cl_session->bc_slot_table;
spin_lock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f4ccfe6..19ca95c 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
goto out;
}
- args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+ args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
status = htonl(NFS4ERR_DELAY);
goto out;
@@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
- rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls *
+ rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
sizeof(*rc_list->rcl_refcalls),
GFP_KERNEL);
if (unlikely(rc_list->rcl_refcalls == NULL))
@@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
for (i = 0; i < args->csa_nrclists; i++) {
status = decode_rc_list(xdr, &args->csa_rclists[i]);
- if (status)
+ if (status) {
+ args->csa_nrclists = i;
goto out_free;
+ }
}
}
status = 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f9f4845..1987415 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
static bool nfs_client_init_is_complete(const struct nfs_client *clp)
{
- return clp->cl_cons_state != NFS_CS_INITING;
+ return clp->cl_cons_state <= NFS_CS_READY;
}
int nfs_wait_client_init_complete(const struct nfs_client *clp)
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index da54332..a6ad688 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -180,10 +180,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
delegation->cred = get_rpccred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags);
- NFS_I(inode)->delegation_state = delegation->type;
spin_unlock(&delegation->lock);
- put_rpccred(oldcred);
rcu_read_unlock();
+ put_rpccred(oldcred);
trace_nfs4_reclaim_delegation(inode, res->delegation_type);
} else {
/* We appear to have raced with a delegation return. */
@@ -275,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
list_del_rcu(&delegation->super_list);
delegation->inode = NULL;
- nfsi->delegation_state = 0;
rcu_assign_pointer(nfsi->delegation, NULL);
spin_unlock(&delegation->lock);
return delegation;
@@ -355,7 +353,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
&delegation->stateid)) {
nfs_update_inplace_delegation(old_delegation,
delegation);
- nfsi->delegation_state = old_delegation->type;
goto out;
}
/*
@@ -373,13 +370,15 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
delegation = NULL;
goto out;
}
- freeme = nfs_detach_delegation_locked(nfsi,
+ if (test_and_set_bit(NFS_DELEGATION_RETURNING,
+ &old_delegation->flags))
+ goto out;
+ freeme = nfs_detach_delegation_locked(nfsi,
old_delegation, clp);
if (freeme == NULL)
goto out;
}
list_add_rcu(&delegation->super_list, &server->delegations);
- nfsi->delegation_state = delegation->type;
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
@@ -437,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
{
bool ret = false;
+ if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ goto out;
if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
ret = true;
if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) {
@@ -448,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
ret = true;
spin_unlock(&delegation->lock);
}
+out:
return ret;
}
@@ -475,14 +477,20 @@ restart:
super_list) {
if (!nfs_delegation_need_return(delegation))
continue;
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL)
+ if (!nfs_sb_active(server->super))
continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
+ rcu_read_unlock();
+ nfs_sb_deactive(server->super);
+ goto restart;
+ }
delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
err = nfs_end_delegation_return(inode, delegation, 0);
iput(inode);
+ nfs_sb_deactive(server->super);
if (!err)
goto restart;
set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
@@ -813,19 +821,30 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
+ if (test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags))
+ continue;
if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags) == 0)
continue;
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL)
+ if (!nfs_sb_active(server->super))
continue;
- delegation = nfs_detach_delegation(NFS_I(inode),
- delegation, server);
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
+ rcu_read_unlock();
+ nfs_sb_deactive(server->super);
+ goto restart;
+ }
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
-
- if (delegation != NULL)
- nfs_free_delegation(delegation);
+ if (delegation != NULL) {
+ delegation = nfs_detach_delegation(NFS_I(inode),
+ delegation, server);
+ if (delegation != NULL)
+ nfs_free_delegation(delegation);
+ }
iput(inode);
+ nfs_sb_deactive(server->super);
goto restart;
}
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9b0c55c..c19e16f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc,
return 0;
}
+/* Match file and dirent using either filehandle or fileid
+ * Note: caller is responsible for checking the fsid
+ */
static
int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
{
+ struct nfs_inode *nfsi;
+
if (dentry->d_inode == NULL)
goto different;
- if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
- goto different;
- return 1;
+
+ nfsi = NFS_I(dentry->d_inode);
+ if (entry->fattr->fileid == nfsi->fileid)
+ return 1;
+ if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
+ return 1;
different:
return 0;
}
@@ -469,6 +477,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
struct inode *inode;
int status;
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID))
+ return;
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
+ return;
if (filename.name[0] == '.') {
if (filename.len == 1)
return;
@@ -479,6 +491,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
dentry = d_lookup(parent, &filename);
if (dentry != NULL) {
+ /* Is there a mountpoint here? If so, just exit */
+ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
+ &entry->fattr->fsid))
+ goto out;
if (nfs_same_file(dentry, entry)) {
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 5db3385..c3929fb 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -283,7 +283,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq)
{
- cinfo->lock = &dreq->lock;
+ cinfo->lock = &dreq->inode->i_lock;
cinfo->mds = &dreq->mds_cinfo;
cinfo->ds = &dreq->ds_cinfo;
cinfo->dreq = dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5d8b89c..37b1558 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -177,7 +177,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos);
- result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+ result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_read_iter(iocb, to);
if (result > 0)
@@ -198,7 +198,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos);
- res = nfs_revalidate_mapping(inode, filp->f_mapping);
+ res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0)
@@ -371,6 +371,10 @@ start:
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
return ret;
+ /*
+ * Wait for O_DIRECT to complete
+ */
+ nfs_inode_dio_wait(mapping->host);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -618,6 +622,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(inode), page);
+ wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+
lock_page(page);
mapping = page_file_mapping(page);
if (mapping != inode->i_mapping)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7ae1c26..91e88a7 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -960,52 +960,19 @@ filelayout_mark_request_commit(struct nfs_page *req,
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
u32 i, j;
- struct list_head *list;
- struct pnfs_commit_bucket *buckets;
if (fl->commit_through_mds) {
- list = &cinfo->mds->list;
- spin_lock(cinfo->lock);
- goto mds_commit;
- }
-
- /* Note that we are calling nfs4_fl_calc_j_index on each page
- * that ends up being committed to a data server. An attractive
- * alternative is to add a field to nfs_write_data and nfs_page
- * to store the value calculated in filelayout_write_pagelist
- * and just use that here.
- */
- j = nfs4_fl_calc_j_index(lseg, req_offset(req));
- i = select_bucket_index(fl, j);
- spin_lock(cinfo->lock);
- buckets = cinfo->ds->buckets;
- list = &buckets[i].written;
- if (list_empty(list)) {
- /* Non-empty buckets hold a reference on the lseg. That ref
- * is normally transferred to the COMMIT call and released
- * there. It could also be released if the last req is pulled
- * off due to a rewrite, in which case it will be done in
- * pnfs_generic_clear_request_commit
+ nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+ } else {
+ /* Note that we are calling nfs4_fl_calc_j_index on each page
+ * that ends up being committed to a data server. An attractive
+ * alternative is to add a field to nfs_write_data and nfs_page
+ * to store the value calculated in filelayout_write_pagelist
+ * and just use that here.
*/
- buckets[i].wlseg = pnfs_get_lseg(lseg);
- }
- set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
- cinfo->ds->nwritten++;
-
-mds_commit:
- /* nfs_request_add_commit_list(). We need to add req to list without
- * dropping cinfo lock.
- */
- set_bit(PG_CLEAN, &(req)->wb_flags);
- nfs_list_add_request(req, list);
- cinfo->mds->ncommit++;
- spin_unlock(cinfo->lock);
- if (!cinfo->dreq) {
- inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
- BDI_RECLAIMABLE);
- __mark_inode_dirty(req->wb_context->dentry->d_inode,
- I_DIRTY_DATASYNC);
+ j = nfs4_fl_calc_j_index(lseg, req_offset(req));
+ i = select_bucket_index(fl, j);
+ pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
}
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index c22ecaa..315cc68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1332,47 +1332,6 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
return PNFS_ATTEMPTED;
}
-static void
-ff_layout_mark_request_commit(struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- u32 ds_commit_idx)
-{
- struct list_head *list;
- struct pnfs_commit_bucket *buckets;
-
- spin_lock(cinfo->lock);
- buckets = cinfo->ds->buckets;
- list = &buckets[ds_commit_idx].written;
- if (list_empty(list)) {
- /* Non-empty buckets hold a reference on the lseg. That ref
- * is normally transferred to the COMMIT call and released
- * there. It could also be released if the last req is pulled
- * off due to a rewrite, in which case it will be done in
- * pnfs_common_clear_request_commit
- */
- WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
- buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
- }
- set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
- cinfo->ds->nwritten++;
-
- /* nfs_request_add_commit_list(). We need to add req to list without
- * dropping cinfo lock.
- */
- set_bit(PG_CLEAN, &(req)->wb_flags);
- nfs_list_add_request(req, list);
- cinfo->mds->ncommit++;
- spin_unlock(cinfo->lock);
- if (!cinfo->dreq) {
- inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
- BDI_RECLAIMABLE);
- __mark_inode_dirty(req->wb_context->dentry->d_inode,
- I_DIRTY_DATASYNC);
- }
-}
-
static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
{
return i;
@@ -1540,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
.free_deviceid_node = ff_layout_free_deveiceid_node,
- .mark_request_commit = ff_layout_mark_request_commit,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
.clear_request_commit = pnfs_generic_clear_request_commit,
.scan_commit_lists = pnfs_generic_scan_commit_lists,
.recover_commit_reqs = pnfs_generic_recover_commit_reqs,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e4f0dce..d42dff6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
* This is a copy of the common vmtruncate, but with the locking
* corrected to take into account the fact that NFS requires
* inode->i_size to be updated under the inode->i_lock.
+ * Note: must be called with inode->i_lock held!
*/
static int nfs_vmtruncate(struct inode * inode, loff_t offset)
{
@@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
if (err)
goto out;
- spin_lock(&inode->i_lock);
i_size_write(inode, offset);
/* Optimisation */
if (offset == 0)
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&inode->i_lock);
truncate_pagecache(inode, offset);
+ spin_lock(&inode->i_lock);
out:
return err;
}
@@ -585,10 +586,15 @@ out:
* Note: we do this in the *proc.c in order to ensure that
* it works for things like exclusive creates too.
*/
-void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
+ struct nfs_fattr *fattr)
{
+ /* Barrier: bump the attribute generation count. */
+ nfs_fattr_set_barrier(fattr);
+
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->attr_gencount = fattr->gencount;
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
- spin_lock(&inode->i_lock);
if ((attr->ia_valid & ATTR_MODE) != 0) {
int mode = attr->ia_mode & S_IALLUGO;
mode |= inode->i_mode & ~S_IALLUGO;
@@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
inode->i_gid = attr->ia_gid;
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL);
- spin_unlock(&inode->i_lock);
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
+ nfs_update_inode(inode, fattr);
+ spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
@@ -1028,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
if (mapping->nrpages != 0) {
if (S_ISREG(inode->i_mode)) {
+ unmap_mapping_range(mapping, 0, 0, 0);
ret = nfs_sync_mapping(mapping);
if (ret < 0)
return ret;
@@ -1060,11 +1068,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
}
/**
- * nfs_revalidate_mapping - Revalidate the pagecache
+ * __nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
+ * @may_lock - take inode->i_mutex?
*/
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static int __nfs_revalidate_mapping(struct inode *inode,
+ struct address_space *mapping,
+ bool may_lock)
{
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
@@ -1113,7 +1124,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
+ if (may_lock) {
+ mutex_lock(&inode->i_mutex);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ mutex_unlock(&inode->i_mutex);
+ } else
+ ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1123,6 +1139,29 @@ out:
return ret;
}
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+ return __nfs_revalidate_mapping(inode, mapping, false);
+}
+
+/**
+ * nfs_revalidate_mapping_protected - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ *
+ * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
+ * while invalidating the mapping.
+ */
+int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+{
+ return __nfs_revalidate_mapping(inode, mapping, true);
+}
+
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1231,13 +1270,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat
return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
}
-static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
- if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
- return 0;
- return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
-}
-
static atomic_long_t nfs_attr_generation_counter;
static unsigned long nfs_read_attr_generation_counter(void)
@@ -1249,6 +1281,7 @@ unsigned long nfs_inc_attr_generation_counter(void)
{
return atomic_long_inc_return(&nfs_attr_generation_counter);
}
+EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter);
void nfs_fattr_init(struct nfs_fattr *fattr)
{
@@ -1260,6 +1293,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
}
EXPORT_SYMBOL_GPL(nfs_fattr_init);
+/**
+ * nfs_fattr_set_barrier
+ * @fattr: attributes
+ *
+ * Used to set a barrier after an attribute was updated. This
+ * barrier ensures that older attributes from RPC calls that may
+ * have raced with our update cannot clobber these new values.
+ * Note that you are still responsible for ensuring that other
+ * operations which change the attribute on the server do not
+ * collide.
+ */
+void nfs_fattr_set_barrier(struct nfs_fattr *fattr)
+{
+ fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
struct nfs_fattr *nfs_alloc_fattr(void)
{
struct nfs_fattr *fattr;
@@ -1370,7 +1419,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
nfs_ctime_need_update(inode, fattr) ||
- nfs_size_need_update(inode, fattr) ||
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
@@ -1460,6 +1508,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
int status;
spin_lock(&inode->i_lock);
+ nfs_fattr_set_barrier(fattr);
status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
@@ -1468,7 +1517,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
/**
- * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
* @inode - pointer to inode
* @fattr - updated attributes
*
@@ -1478,11 +1527,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
*
* This function is mainly designed to be used by the ->write_done() functions.
*/
-int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int status;
- spin_lock(&inode->i_lock);
/* Don't do a WCC update if these attributes are already stale */
if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
!nfs_inode_attrs_need_update(inode, fattr)) {
@@ -1514,6 +1562,27 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
}
out_noforce:
status = nfs_post_op_update_inode_locked(inode, fattr);
+ return status;
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ nfs_fattr_set_barrier(fattr);
+ status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
spin_unlock(&inode->i_lock);
return status;
}
@@ -1715,6 +1784,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
+ /* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
@@ -1722,6 +1792,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
}
+ /* Set the barrier to be more recent than this fattr */
+ if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+ nfsi->attr_gencount = fattr->gencount;
}
invalid &= ~NFS_INO_INVALID_ATTR;
/* Don't invalidate the data if we were to blame */
@@ -1775,7 +1848,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
#if IS_ENABLED(CONFIG_NFS_V4)
INIT_LIST_HEAD(&nfsi->open_states);
nfsi->delegation = NULL;
- nfsi->delegation_state = 0;
init_rwsem(&nfsi->rwsem);
nfsi->layout = NULL;
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 212b8c8..9e6475b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
struct nfs_commit_info *cinfo,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
@@ -598,6 +599,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
}
/*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+
+ inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+}
+
+/*
* Determine the number of bytes of data the page contains
*/
static inline
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 78e557c..1f11d25 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
- nfs_setattr_update_inode(inode, sattr);
+ nfs_setattr_update_inode(inode, sattr, fattr);
dprintk("NFS reply setattr: %d\n", status);
return status;
}
@@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
+ nfs_writeback_update_inode(hdr);
return 0;
}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 2a932fd..53852a4 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1987,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+ if (entry->fattr->fileid != entry->ino) {
+ entry->fattr->mounted_on_fileid = entry->ino;
+ entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+ }
+
/* In fact, a post_op_fh3: */
p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8646af9..86d6214 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -621,6 +621,9 @@ int nfs41_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+ if (pos == new)
+ goto found;
+
if (pos->rpc_ops != new->rpc_ops)
continue;
@@ -639,10 +642,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
prev = pos;
status = nfs_wait_client_init_complete(pos);
- if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
- nfs4_schedule_lease_recovery(pos);
- status = nfs4_wait_clnt_recover(pos);
- }
spin_lock(&nn->nfs_client_lock);
if (status < 0)
break;
@@ -668,7 +667,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
*/
if (!nfs4_match_client_owner_id(pos, new))
continue;
-
+found:
atomic_inc(&pos->cl_count);
*result = pos;
status = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2e7c9f7..627f37c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -901,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
if (!cinfo->atomic || cinfo->before != dir->i_version)
nfs_force_lookup_revalidate(dir);
dir->i_version = cinfo->after;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfs_fscache_invalidate(dir);
spin_unlock(&dir->i_lock);
}
@@ -1552,6 +1553,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
+ opendata->o_arg.share_access = nfs4_map_atomic_open_share(
+ NFS_SB(opendata->dentry->d_sb),
+ fmode, 0);
memset(&opendata->o_res, 0, sizeof(opendata->o_res));
memset(&opendata->c_res, 0, sizeof(opendata->c_res));
nfs4_init_opendata_res(opendata);
@@ -2413,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir,
opendata->o_res.f_attr, sattr,
state, label, olabel);
if (status == 0) {
- nfs_setattr_update_inode(state->inode, sattr);
- nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_setattr_update_inode(state->inode, sattr,
+ opendata->o_res.f_attr);
nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
}
}
@@ -2651,7 +2655,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_EXPIRED:
if (!nfs4_stateid_match(&calldata->arg.stateid,
- &state->stateid)) {
+ &state->open_stateid)) {
rpc_restart_call_prepare(task);
goto out_release;
}
@@ -2687,7 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
- nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
+ nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid);
/* Calculate the change in open mode */
calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
@@ -3288,7 +3292,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
if (status == 0) {
- nfs_setattr_update_inode(inode, sattr);
+ nfs_setattr_update_inode(inode, sattr, fattr);
nfs_setsecurity(inode, fattr, label);
}
nfs4_label_free(label);
@@ -4234,7 +4238,7 @@ static int nfs4_write_done_cb(struct rpc_task *task,
}
if (task->tk_status >= 0) {
renew_lease(NFS_SERVER(inode), hdr->timestamp);
- nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
+ nfs_writeback_update_inode(hdr);
}
return 0;
}
@@ -6648,47 +6652,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
{
int status;
+ struct nfs41_bind_conn_to_session_args args = {
+ .client = clp,
+ .dir = NFS4_CDFC4_FORE_OR_BOTH,
+ };
struct nfs41_bind_conn_to_session_res res;
struct rpc_message msg = {
.rpc_proc =
&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
- .rpc_argp = clp,
+ .rpc_argp = &args,
.rpc_resp = &res,
.rpc_cred = cred,
};
dprintk("--> %s\n", __func__);
- res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
- if (unlikely(res.session == NULL)) {
- status = -ENOMEM;
- goto out;
- }
+ nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+ if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+ args.dir = NFS4_CDFC4_FORE;
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
trace_nfs4_bind_conn_to_session(clp, status);
if (status == 0) {
- if (memcmp(res.session->sess_id.data,
+ if (memcmp(res.sessionid.data,
clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
dprintk("NFS: %s: Session ID mismatch\n", __func__);
status = -EIO;
- goto out_session;
+ goto out;
}
- if (res.dir != NFS4_CDFS4_BOTH) {
+ if ((res.dir & args.dir) != res.dir || res.dir == 0) {
dprintk("NFS: %s: Unexpected direction from server\n",
__func__);
status = -EIO;
- goto out_session;
+ goto out;
}
- if (res.use_conn_in_rdma_mode) {
+ if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
dprintk("NFS: %s: Server returned RDMA mode = true\n",
__func__);
status = -EIO;
- goto out_session;
+ goto out;
}
}
-out_session:
- kfree(res.session);
out:
dprintk("<-- %s status= %d\n", __func__, status);
return status;
@@ -6893,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
if (status == 0) {
clp->cl_clientid = res.clientid;
- clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
- if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R))
+ clp->cl_exchange_flags = res.flags;
+ /* Client ID is not confirmed */
+ if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+ clear_bit(NFS4_SESSION_ESTABLISHED,
+ &clp->cl_session->session_state);
clp->cl_seqid = res.seqid;
+ }
kfree(clp->cl_serverowner);
clp->cl_serverowner = res.server_owner;
@@ -7166,10 +7174,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->bc_attrs.max_reqs);
}
-static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
{
struct nfs4_channel_attrs *sent = &args->fc_attrs;
- struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+ struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
if (rcvd->max_resp_sz > sent->max_resp_sz)
return -EINVAL;
@@ -7188,11 +7197,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
return 0;
}
-static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
{
struct nfs4_channel_attrs *sent = &args->bc_attrs;
- struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
+ struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
+ if (!(res->flags & SESSION4_BACK_CHAN))
+ goto out;
if (rcvd->max_rqst_sz > sent->max_rqst_sz)
return -EINVAL;
if (rcvd->max_resp_sz < sent->max_resp_sz)
@@ -7204,18 +7216,33 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
return -EINVAL;
if (rcvd->max_reqs != sent->max_reqs)
return -EINVAL;
+out:
return 0;
}
static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
- struct nfs4_session *session)
+ struct nfs41_create_session_res *res)
{
int ret;
- ret = nfs4_verify_fore_channel_attrs(args, session);
+ ret = nfs4_verify_fore_channel_attrs(args, res);
if (ret)
return ret;
- return nfs4_verify_back_channel_attrs(args, session);
+ return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+ struct nfs41_create_session_res *res)
+{
+ nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+ /* Mark client id and session as being confirmed */
+ session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state);
+ session->flags = res->flags;
+ memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+ if (res->flags & SESSION4_BACK_CHAN)
+ memcpy(&session->bc_attrs, &res->bc_attrs,
+ sizeof(session->bc_attrs));
}
static int _nfs4_proc_create_session(struct nfs_client *clp,
@@ -7224,11 +7251,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
struct nfs4_session *session = clp->cl_session;
struct nfs41_create_session_args args = {
.client = clp,
+ .clientid = clp->cl_clientid,
+ .seqid = clp->cl_seqid,
.cb_program = NFS4_CALLBACK,
};
- struct nfs41_create_session_res res = {
- .client = clp,
- };
+ struct nfs41_create_session_res res;
+
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
.rpc_argp = &args,
@@ -7245,11 +7273,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
if (!status) {
/* Verify the session's negotiated channel_attrs values */
- status = nfs4_verify_channel_attrs(&args, session);
+ status = nfs4_verify_channel_attrs(&args, &res);
/* Increment the clientid slot sequence id */
- clp->cl_seqid++;
+ if (clp->cl_seqid == res.seqid)
+ clp->cl_seqid++;
+ if (status)
+ goto out;
+ nfs4_update_session(session, &res);
}
-
+out:
return status;
}
@@ -7301,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
dprintk("--> nfs4_proc_destroy_session\n");
/* session is still being setup */
- if (session->clp->cl_cons_state != NFS_CS_READY)
- return status;
+ if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
+ return 0;
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
trace_nfs4_destroy_session(session->clp, status);
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e799dc3..e23366e 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
tbl = &ses->fc_slot_table;
tbl->session = ses;
status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
- if (status) /* -ENOMEM */
+ if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
return status;
/* Back channel */
tbl = &ses->bc_slot_table;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index b34ada9..e3ea2c5 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -70,6 +70,7 @@ struct nfs4_session {
enum nfs4_session_state {
NFS4_SESSION_INITING,
+ NFS4_SESSION_ESTABLISHED,
};
extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
@@ -118,6 +119,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
return 0;
}
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+ const struct nfs4_sessionid *src)
+{
+ memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
#ifdef CONFIG_CRC32
/*
* nfs_session_id_hash - calculate the crc32 hash for the session id
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5ad908e..f95e3b5 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -346,9 +346,23 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
status = nfs4_proc_exchange_id(clp, cred);
if (status != NFS4_OK)
return status;
- set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- return nfs41_walk_client_list(clp, result, cred);
+ status = nfs41_walk_client_list(clp, result, cred);
+ if (status < 0)
+ return status;
+ if (clp != *result)
+ return 0;
+
+ /* Purge state if the client id was established in a prior instance */
+ if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)
+ set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ else
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+ status = nfs_wait_client_init_complete(clp);
+ if (status < 0)
+ nfs_put_client(clp);
+ return status;
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e23a0a6..5c399ec 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1715,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
#if defined(CONFIG_NFS_V4_1)
/* NFSv4.1 operations */
static void encode_bind_conn_to_session(struct xdr_stream *xdr,
- struct nfs4_session *session,
+ struct nfs41_bind_conn_to_session_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
decode_bind_conn_to_session_maxsz, hdr);
- encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
p = xdr_reserve_space(xdr, 8);
- *p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
- *p = 0; /* use_conn_in_rdma_mode = False */
+ *p++ = cpu_to_be32(args->dir);
+ *p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
}
static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
@@ -1806,8 +1806,8 @@ static void encode_create_session(struct xdr_stream *xdr,
encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
- p = xdr_encode_hyper(p, clp->cl_clientid);
- *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
+ p = xdr_encode_hyper(p, args->clientid);
+ *p++ = cpu_to_be32(args->seqid); /*Sequence id */
*p++ = cpu_to_be32(args->flags); /*flags */
/* Fore Channel */
@@ -2734,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
*/
static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
struct xdr_stream *xdr,
- struct nfs_client *clp)
+ struct nfs41_bind_conn_to_session_args *args)
{
struct compound_hdr hdr = {
- .minorversion = clp->cl_mvops->minor_version,
+ .minorversion = args->client->cl_mvops->minor_version,
};
encode_compound_hdr(xdr, req, &hdr);
- encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+ encode_bind_conn_to_session(xdr, args, &hdr);
encode_nops(&hdr);
}
@@ -5613,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
if (!status)
- status = decode_sessionid(xdr, &res->session->sess_id);
+ status = decode_sessionid(xdr, &res->sessionid);
if (unlikely(status))
return status;
@@ -5641,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr,
{
__be32 *p;
int status;
- struct nfs_client *clp = res->client;
- struct nfs4_session *session = clp->cl_session;
status = decode_op_hdr(xdr, OP_CREATE_SESSION);
if (!status)
- status = decode_sessionid(xdr, &session->sess_id);
+ status = decode_sessionid(xdr, &res->sessionid);
if (unlikely(status))
return status;
@@ -5654,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
goto out_overflow;
- clp->cl_seqid = be32_to_cpup(p++);
- session->flags = be32_to_cpup(p);
+ res->seqid = be32_to_cpup(p++);
+ res->flags = be32_to_cpup(p);
/* Channel attributes */
- status = decode_chan_attrs(xdr, &session->fc_attrs);
+ status = decode_chan_attrs(xdr, &res->fc_attrs);
if (!status)
- status = decode_chan_attrs(xdr, &session->bc_attrs);
+ status = decode_chan_attrs(xdr, &res->bc_attrs);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 797cd62..635f086 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -344,6 +344,10 @@ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
struct xdr_stream *xdr,
gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
static inline bool nfs_have_layout(struct inode *inode)
{
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index fdc4f65..54e36b3 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -838,3 +838,33 @@ out_err:
return NULL;
}
EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
+{
+ struct list_head *list;
+ struct pnfs_commit_bucket *buckets;
+
+ spin_lock(cinfo->lock);
+ buckets = cinfo->ds->buckets;
+ list = &buckets[ds_commit_idx].written;
+ if (list_empty(list)) {
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_common_clear_request_commit
+ */
+ WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
+ buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
+ }
+ set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ cinfo->ds->nwritten++;
+ spin_unlock(cinfo->lock);
+
+ nfs_request_add_commit_list(req, list, cinfo);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b09cc23..c63189a 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
- nfs_setattr_update_inode(inode, sattr);
+ nfs_setattr_update_inode(inode, sattr, fattr);
dprintk("NFS reply setattr: %d\n", status);
return status;
}
@@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = hdr->inode;
-
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
+ nfs_writeback_update_inode(hdr);
return 0;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 88a6d21..849ed78 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -789,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
nfs_list_add_request(req, dst);
cinfo->mds->ncommit++;
spin_unlock(cinfo->lock);
- if (!cinfo->dreq) {
- inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
- BDI_RECLAIMABLE);
- __mark_inode_dirty(req->wb_context->dentry->d_inode,
- I_DIRTY_DATASYNC);
- }
+ if (!cinfo->dreq)
+ nfs_mark_page_unstable(req->wb_page);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -1382,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode)
return 0;
}
+static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
+ struct nfs_fattr *fattr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+ return;
+ if (argp->offset + resp->count != fattr->size)
+ return;
+ if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode))
+ return;
+ /* Set attribute barrier */
+ nfs_fattr_set_barrier(fattr);
+}
+
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
+{
+ struct nfs_fattr *fattr = hdr->res.fattr;
+ struct inode *inode = hdr->inode;
+
+ if (fattr == NULL)
+ return;
+ spin_lock(&inode->i_lock);
+ nfs_writeback_check_extend(hdr, fattr);
+ nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_writeback_update_inode);
+
/*
* This function is called when the WRITE call is complete.
*/
@@ -1605,11 +1630,8 @@ void nfs_retry_commit(struct list_head *page_list,
req = nfs_list_entry(page_list->next);
nfs_list_remove_request(req);
nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
- if (!cinfo->dreq) {
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host),
- BDI_RECLAIMABLE);
- }
+ if (!cinfo->dreq)
+ nfs_clear_page_commit(req->wb_page);
nfs_unlock_and_release_request(req);
}
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 3c1bfa1..1028a06 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -587,8 +587,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
- nfsd4_cb_layout_fail(ls);
-
printk(KERN_WARNING
"nfsd: client %s failed to respond to layout recall. "
" Fencing..\n", addr_str);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index cc6a760..1c307f0 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -583,7 +583,7 @@ nfs4_reset_recoverydir(char *recdir)
if (status)
return status;
status = -ENOTDIR;
- if (S_ISDIR(path.dentry->d_inode->i_mode)) {
+ if (d_is_dir(path.dentry)) {
strcpy(user_recovery_dirname, recdir);
status = 0;
}
@@ -1426,7 +1426,7 @@ nfsd4_client_tracking_init(struct net *net)
nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
if (!status) {
- status = S_ISDIR(path.dentry->d_inode->i_mode);
+ status = d_is_dir(path.dentry);
path_put(&path);
if (status)
goto do_init;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f6b2a09..d2f2c37 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1638,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp)
nfs4_put_stid(&dp->dl_stid);
}
while (!list_empty(&clp->cl_revoked)) {
- dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
+ dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
nfs4_put_stid(&dp->dl_stid);
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 965b478..e9fa966 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -114,8 +114,8 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
* We're exposing only the directories and symlinks that have to be
* traversed on the way to real exports:
*/
- if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) &&
- !S_ISLNK(dentry->d_inode->i_mode)))
+ if (unlikely(!d_is_dir(dentry) &&
+ !d_is_symlink(dentry)))
return nfserr_stale;
/*
* A pseudoroot export gives permission to access only one
@@ -259,7 +259,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
goto out;
}
- if (S_ISDIR(dentry->d_inode->i_mode) &&
+ if (d_is_dir(dentry) &&
(dentry->d_flags & DCACHE_DISCONNECTED)) {
printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n",
dentry);
@@ -414,7 +414,7 @@ static inline void _fh_update_old(struct dentry *dentry,
{
fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino);
fh->ofh_generation = dentry->d_inode->i_generation;
- if (S_ISDIR(dentry->d_inode->i_mode) ||
+ if (d_is_dir(dentry) ||
(exp->ex_flags & NFSEXP_NOSUBTREECHECK))
fh->ofh_dirino = 0;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5685c67..3685265 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -615,9 +615,9 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
export = fhp->fh_export;
dentry = fhp->fh_dentry;
- if (S_ISREG(dentry->d_inode->i_mode))
+ if (d_is_reg(dentry))
map = nfs3_regaccess;
- else if (S_ISDIR(dentry->d_inode->i_mode))
+ else if (d_is_dir(dentry))
map = nfs3_diraccess;
else
map = nfs3_anyaccess;
@@ -1402,7 +1402,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
switch (createmode) {
case NFS3_CREATE_UNCHECKED:
- if (! S_ISREG(dchild->d_inode->i_mode))
+ if (! d_is_reg(dchild))
goto out;
else if (truncp) {
/* in nfsv4, we need to treat this case a little
@@ -1615,7 +1615,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
if (err)
goto out;
err = nfserr_isdir;
- if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode))
+ if (d_is_dir(tfhp->fh_dentry))
goto out;
err = nfserr_perm;
if (!len)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b2e3ff3..ecdbae1 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,6 +31,8 @@
#include "alloc.h"
#include "dat.h"
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
struct nilfs_btree_path *path;
@@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
return ret;
}
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+ unsigned long ino)
+{
+ int level, flags, nchildren;
+ int ret = 0;
+
+ level = nilfs_btree_node_get_level(node);
+ flags = nilfs_btree_node_get_flags(node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+
+ if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+ level > NILFS_BTREE_LEVEL_MAX ||
+ nchildren < 0 ||
+ nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+ pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
+ ino, level, flags, nchildren);
+ ret = 1;
+ }
+ return ret;
+}
+
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
int ret;
@@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
/* convert and insert */
dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
- nilfs_btree_init(btree);
+ __nilfs_btree_init(btree);
if (nreq != NULL) {
nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
@@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
.bop_gather_data = NULL,
};
-int nilfs_btree_init(struct nilfs_bmap *bmap)
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
{
bmap->b_ops = &nilfs_btree_ops;
bmap->b_nchildren_per_block =
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
- return 0;
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+ int ret = 0;
+
+ __nilfs_btree_init(bmap);
+
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+ bmap->b_inode->i_ino))
+ ret = -EIO;
+ return ret;
}
void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b..0c3f303 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
struct nilfs_inode_info *ii, *n;
+ int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
brelse(ii->i_bh);
ii->i_bh = NULL;
list_del_init(&ii->i_dirty);
- if (!ii->vfs_inode.i_nlink) {
+ if (!ii->vfs_inode.i_nlink || during_mount) {
/*
- * Defer calling iput() to avoid a deadlock
- * over I_SYNC flag for inodes with i_nlink == 0
+ * Defer calling iput() to avoid deadlocks if
+ * i_nlink == 0 or mount is not yet finished.
*/
list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
defer_iput = true;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 51ceb81..d2f97ec 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -115,8 +115,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
return false;
/* sorry, fanotify only gives a damn about files and dirs */
- if (!S_ISREG(path->dentry->d_inode->i_mode) &&
- !S_ISDIR(path->dentry->d_inode->i_mode))
+ if (!d_is_reg(path->dentry) &&
+ !d_can_lookup(path->dentry))
return false;
if (inode_mark && vfsmnt_mark) {
@@ -139,11 +139,12 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
BUG();
}
- if (S_ISDIR(path->dentry->d_inode->i_mode) &&
+ if (d_is_dir(path->dentry) &&
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return false;
- if (event_mask & marks_mask & ~marks_ignored_mask)
+ if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
+ ~marks_ignored_mask)
return true;
return false;
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 36ae529..2ff263e 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-ccflags-y := -DNTFS_VERSION=\"2.1.31\"
+ccflags-y := -DNTFS_VERSION=\"2.1.32\"
ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f16f2d8..c1da78d 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
/*
* file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -328,62 +328,168 @@ err_out:
return err;
}
-/**
- * ntfs_fault_in_pages_readable -
- *
- * Fault a number of userspace pages into pagetables.
- *
- * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
- * with more than two userspace pages as well as handling the single page case
- * elegantly.
- *
- * If you find this difficult to understand, then think of the while loop being
- * the following code, except that we do without the integer variable ret:
- *
- * do {
- * ret = __get_user(c, uaddr);
- * uaddr += PAGE_SIZE;
- * } while (!ret && uaddr < end);
- *
- * Note, the final __get_user() may well run out-of-bounds of the user buffer,
- * but _not_ out-of-bounds of the page the user buffer belongs to, and since
- * this is only a read and not a write, and since it is still in the same page,
- * it should not matter and this makes the code much simpler.
- */
-static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
- int bytes)
+static ssize_t ntfs_prepare_file_for_write(struct file *file, loff_t *ppos,
+ size_t *count)
{
- const char __user *end;
- volatile char c;
-
- /* Set @end to the first byte outside the last page we care about. */
- end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
-
- while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
- ;
-}
-
-/**
- * ntfs_fault_in_pages_readable_iovec -
- *
- * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
- */
-static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
- size_t iov_ofs, int bytes)
-{
- do {
- const char __user *buf;
- unsigned len;
+ loff_t pos;
+ s64 end, ll;
+ ssize_t err;
+ unsigned long flags;
+ struct inode *vi = file_inode(file);
+ ntfs_inode *base_ni, *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
- buf = iov->iov_base + iov_ofs;
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- ntfs_fault_in_pages_readable(buf, len);
- bytes -= len;
- iov++;
- iov_ofs = 0;
- } while (bytes);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%lx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)*ppos, (unsigned long)*count);
+ /* We can write back this queue in page reclaim. */
+ current->backing_dev_info = inode_to_bdi(vi);
+ err = generic_write_checks(file, ppos, count, S_ISBLK(vi->i_mode));
+ if (unlikely(err))
+ goto out;
+ /*
+ * All checks have passed. Before we start doing any writing we want
+ * to abort any totally illegal writes.
+ */
+ BUG_ON(NInoMstProtected(ni));
+ BUG_ON(ni->type != AT_DATA);
+ /* If file is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ /* Only $DATA attributes can be encrypted. */
+ /*
+ * Reminder for later: Encrypted files are _always_
+ * non-resident so that the content can always be encrypted.
+ */
+ ntfs_debug("Denying write access to encrypted file.");
+ err = -EACCES;
+ goto out;
+ }
+ if (NInoCompressed(ni)) {
+ /* Only unnamed $DATA attribute can be compressed. */
+ BUG_ON(ni->name_len);
+ /*
+ * Reminder for later: If resident, the data is not actually
+ * compressed. Only on the switch to non-resident does
+ * compression kick in. This is in contrast to encrypted files
+ * (see above).
+ */
+ ntfs_error(vi->i_sb, "Writing to compressed files is not "
+ "implemented yet. Sorry.");
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ if (*count == 0)
+ goto out;
+ base_ni = ni;
+ if (NInoAttr(ni))
+ base_ni = ni->ext.base_ntfs_ino;
+ err = file_remove_suid(file);
+ if (unlikely(err))
+ goto out;
+ /*
+ * Our ->update_time method always succeeds thus file_update_time()
+ * cannot fail either so there is no need to check the return code.
+ */
+ file_update_time(file);
+ pos = *ppos;
+ /* The first byte after the last cluster being written to. */
+ end = (pos + *count + vol->cluster_size_mask) &
+ ~(u64)vol->cluster_size_mask;
+ /*
+ * If the write goes beyond the allocated size, extend the allocation
+ * to cover the whole of the write, rounded up to the nearest cluster.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end > ll) {
+ /*
+ * Extend the allocation without changing the data size.
+ *
+ * Note we ensure the allocation is big enough to at least
+ * write some data but we do not require the allocation to be
+ * complete, i.e. it may be partial.
+ */
+ ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+ if (likely(ll >= 0)) {
+ BUG_ON(pos >= ll);
+ /* If the extension was partial truncate the write. */
+ if (end > ll) {
+ ntfs_debug("Truncating write to inode 0x%lx, "
+ "attribute type 0x%x, because "
+ "the allocation was only "
+ "partially extended.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ *count = ll - pos;
+ }
+ } else {
+ err = ll;
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Perform a partial write if possible or fail. */
+ if (pos < ll) {
+ ntfs_debug("Truncating write to inode 0x%lx "
+ "attribute type 0x%x, because "
+ "extending the allocation "
+ "failed (error %d).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (int)-err);
+ *count = ll - pos;
+ } else {
+ if (err != -ENOSPC)
+ ntfs_error(vi->i_sb, "Cannot perform "
+ "write to inode "
+ "0x%lx, attribute "
+ "type 0x%x, because "
+ "extending the "
+ "allocation failed "
+ "(error %ld).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (long)-err);
+ else
+ ntfs_debug("Cannot perform write to "
+ "inode 0x%lx, "
+ "attribute type 0x%x, "
+ "because there is not "
+ "space left.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ goto out;
+ }
+ }
+ }
+ /*
+ * If the write starts beyond the initialized size, extend it up to the
+ * beginning of the write and initialize all non-sparse space between
+ * the old initialized size and the new one. This automatically also
+ * increments the vfs inode->i_size to keep it above or equal to the
+ * initialized_size.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (pos > ll) {
+ /*
+ * Wait for ongoing direct i/o to complete before proceeding.
+ * New direct i/o cannot start as we hold i_mutex.
+ */
+ inode_dio_wait(vi);
+ err = ntfs_attr_extend_initialized(ni, pos);
+ if (unlikely(err < 0))
+ ntfs_error(vi->i_sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "extending the initialized size "
+ "failed (error %d).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (int)-err);
+ }
+out:
+ return err;
}
/**
@@ -420,8 +526,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
goto err_out;
}
}
- err = add_to_page_cache_lru(*cached_page, mapping, index,
- GFP_KERNEL);
+ err = add_to_page_cache_lru(*cached_page, mapping,
+ index, GFP_KERNEL);
if (unlikely(err)) {
if (err == -EEXIST)
continue;
@@ -1267,180 +1373,6 @@ rl_not_mapped_enoent:
return err;
}
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied. If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static inline size_t ntfs_copy_from_user(struct page **pages,
- unsigned nr_pages, unsigned ofs, const char __user *buf,
- size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- char *addr;
- size_t total = 0;
- unsigned len;
- int left;
-
- do {
- len = PAGE_CACHE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- addr = kmap_atomic(*pages);
- left = __copy_from_user_inatomic(addr + ofs, buf, len);
- kunmap_atomic(addr);
- if (unlikely(left)) {
- /* Do it the slow way. */
- addr = kmap(*pages);
- left = __copy_from_user(addr + ofs, buf, len);
- kunmap(*pages);
- if (unlikely(left))
- goto err_out;
- }
- total += len;
- bytes -= len;
- if (!bytes)
- break;
- buf += len;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err_out:
- total += len - left;
- /* Zero the rest of the target like __copy_from_user(). */
- while (++pages < last_page) {
- bytes -= len;
- if (!bytes)
- break;
- len = PAGE_CACHE_SIZE;
- if (len > bytes)
- len = bytes;
- zero_user(*pages, 0, len);
- }
- goto out;
-}
-
-static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
- const struct iovec *iov, size_t iov_ofs, size_t bytes)
-{
- size_t total = 0;
-
- while (1) {
- const char __user *buf = iov->iov_base + iov_ofs;
- unsigned len;
- size_t left;
-
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- left = __copy_from_user_inatomic(vaddr, buf, len);
- total += len;
- bytes -= len;
- vaddr += len;
- if (unlikely(left)) {
- total -= left;
- break;
- }
- if (!bytes)
- break;
- iov++;
- iov_ofs = 0;
- }
- return total;
-}
-
-static inline void ntfs_set_next_iovec(const struct iovec **iovp,
- size_t *iov_ofsp, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t iov_ofs = *iov_ofsp;
-
- while (bytes) {
- unsigned len;
-
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- bytes -= len;
- iov_ofs += len;
- if (iov->iov_len == iov_ofs) {
- iov++;
- iov_ofs = 0;
- }
- }
- *iovp = iov;
- *iov_ofsp = iov_ofs;
-}
-
-/*
- * This has the same side-effects and return value as ntfs_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
- * single-segment behaviour.
- *
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
- * atomic and when not atomic. This is ok because it calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error. And on many architectures
- * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
- * makes no difference at all on those architectures.
- */
-static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
- unsigned nr_pages, unsigned ofs, const struct iovec **iov,
- size_t *iov_ofs, size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- char *addr;
- size_t copied, len, total = 0;
-
- do {
- len = PAGE_CACHE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- addr = kmap_atomic(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
- *iov, *iov_ofs, len);
- kunmap_atomic(addr);
- if (unlikely(copied != len)) {
- /* Do it the slow way. */
- addr = kmap(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr +
- ofs, *iov, *iov_ofs, len);
- if (unlikely(copied != len))
- goto err_out;
- kunmap(*pages);
- }
- total += len;
- ntfs_set_next_iovec(iov, iov_ofs, len);
- bytes -= len;
- if (!bytes)
- break;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err_out:
- BUG_ON(copied > len);
- /* Zero the rest of the target like __copy_from_user(). */
- memset(addr + ofs + copied, 0, len - copied);
- kunmap(*pages);
- total += copied;
- ntfs_set_next_iovec(iov, iov_ofs, copied);
- while (++pages < last_page) {
- bytes -= len;
- if (!bytes)
- break;
- len = PAGE_CACHE_SIZE;
- if (len > bytes)
- len = bytes;
- zero_user(*pages, 0, len);
- }
- goto out;
-}
-
static inline void ntfs_flush_dcache_pages(struct page **pages,
unsigned nr_pages)
{
@@ -1761,86 +1693,83 @@ err_out:
return err;
}
-static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied. If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
+ unsigned ofs, struct iov_iter *i, size_t bytes)
{
- struct inode *inode = mapping->host;
+ struct page **last_page = pages + nr_pages;
+ size_t total = 0;
+ struct iov_iter data = *i;
+ unsigned len, copied;
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- ntfs_truncate_vfs(inode);
- }
+ do {
+ len = PAGE_CACHE_SIZE - ofs;
+ if (len > bytes)
+ len = bytes;
+ copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
+ len);
+ total += copied;
+ bytes -= copied;
+ if (!bytes)
+ break;
+ iov_iter_advance(&data, copied);
+ if (copied < len)
+ goto err;
+ ofs = 0;
+ } while (++pages < last_page);
+out:
+ return total;
+err:
+ /* Zero the rest of the target like __copy_from_user(). */
+ len = PAGE_CACHE_SIZE - copied;
+ do {
+ if (len > bytes)
+ len = bytes;
+ zero_user(*pages, copied, len);
+ bytes -= len;
+ copied = 0;
+ len = PAGE_CACHE_SIZE;
+ } while (++pages < last_page);
+ goto out;
}
/**
- * ntfs_file_buffered_write -
- *
- * Locking: The vfs is holding ->i_mutex on the inode.
+ * ntfs_perform_write - perform buffered write to a file
+ * @file: file to write to
+ * @i: iov_iter with data to write
+ * @pos: byte offset in file at which to begin writing to
*/
-static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs,
- loff_t pos, loff_t *ppos, size_t count)
+static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
+ loff_t pos)
{
- struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *vi = mapping->host;
ntfs_inode *ni = NTFS_I(vi);
ntfs_volume *vol = ni->vol;
struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
struct page *cached_page = NULL;
- char __user *buf = NULL;
- s64 end, ll;
VCN last_vcn;
LCN lcn;
- unsigned long flags;
- size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
- ssize_t status, written;
+ size_t bytes;
+ ssize_t status, written = 0;
unsigned nr_pages;
- int err;
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "pos 0x%llx, count 0x%lx.",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)pos, (unsigned long)count);
- if (unlikely(!count))
- return 0;
- BUG_ON(NInoMstProtected(ni));
- /*
- * If the attribute is not an index root and it is encrypted or
- * compressed, we cannot write to it yet. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- /*
- * Reminder for later: Encrypted files are _always_
- * non-resident so that the content can always be
- * encrypted.
- */
- ntfs_debug("Denying write access to encrypted file.");
- return -EACCES;
- }
- if (NInoCompressed(ni)) {
- /* Only unnamed $DATA attribute can be compressed. */
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- /*
- * Reminder for later: If resident, the data is not
- * actually compressed. Only on the switch to non-
- * resident does compression kick in. This is in
- * contrast to encrypted files (see above).
- */
- ntfs_error(vi->i_sb, "Writing to compressed files is "
- "not implemented yet. Sorry.");
- return -EOPNOTSUPP;
- }
- }
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%lx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)pos,
+ (unsigned long)iov_iter_count(i));
/*
* If a previous ntfs_truncate() failed, repeat it and abort if it
* fails again.
*/
if (unlikely(NInoTruncateFailed(ni))) {
+ int err;
+
inode_dio_wait(vi);
err = ntfs_truncate(vi);
if (err || NInoTruncateFailed(ni)) {
@@ -1854,81 +1783,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
return err;
}
}
- /* The first byte after the write. */
- end = pos + count;
- /*
- * If the write goes beyond the allocated size, extend the allocation
- * to cover the whole of the write, rounded up to the nearest cluster.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end > ll) {
- /* Extend the allocation without changing the data size. */
- ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
- if (likely(ll >= 0)) {
- BUG_ON(pos >= ll);
- /* If the extension was partial truncate the write. */
- if (end > ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "the allocation was only "
- "partially extended.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- end = ll;
- count = ll - pos;
- }
- } else {
- err = ll;
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Perform a partial write if possible or fail. */
- if (pos < ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "extending the allocation "
- "failed (error code %i).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type), err);
- end = ll;
- count = ll - pos;
- } else {
- ntfs_error(vol->sb, "Cannot perform write to "
- "inode 0x%lx, attribute type "
- "0x%x, because extending the "
- "allocation failed (error "
- "code %i).", vi->i_ino,
- (unsigned)
- le32_to_cpu(ni->type), err);
- return err;
- }
- }
- }
- written = 0;
- /*
- * If the write starts beyond the initialized size, extend it up to the
- * beginning of the write and initialize all non-sparse space between
- * the old initialized size and the new one. This automatically also
- * increments the vfs inode->i_size to keep it above or equal to the
- * initialized_size.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (pos > ll) {
- err = ntfs_attr_extend_initialized(ni, pos);
- if (err < 0) {
- ntfs_error(vol->sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "extending the initialized size "
- "failed (error code %i).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- status = err;
- goto err_out;
- }
- }
/*
* Determine the number of pages per cluster for non-resident
* attributes.
@@ -1936,10 +1790,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
nr_pages = 1;
if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
- /* Finally, perform the actual write. */
last_vcn = -1;
- if (likely(nr_segs == 1))
- buf = iov->iov_base;
do {
VCN vcn;
pgoff_t idx, start_idx;
@@ -1964,10 +1815,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
vol->cluster_size_bits, false);
up_read(&ni->runlist.lock);
if (unlikely(lcn < LCN_HOLE)) {
- status = -EIO;
if (lcn == LCN_ENOMEM)
status = -ENOMEM;
- else
+ else {
+ status = -EIO;
ntfs_error(vol->sb, "Cannot "
"perform write to "
"inode 0x%lx, "
@@ -1976,6 +1827,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
"is corrupt.",
vi->i_ino, (unsigned)
le32_to_cpu(ni->type));
+ }
break;
}
if (lcn == LCN_HOLE) {
@@ -1988,8 +1840,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
}
}
}
- if (bytes > count)
- bytes = count;
+ if (bytes > iov_iter_count(i))
+ bytes = iov_iter_count(i);
+again:
/*
* Bring in the user page(s) that we will copy from _first_.
* Otherwise there is a nasty deadlock on copying from the same
@@ -1998,10 +1851,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (likely(nr_segs == 1))
- ntfs_fault_in_pages_readable(buf, bytes);
- else
- ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
+ if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
/* Get and lock @do_pages starting at index @start_idx. */
status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
pages, &cached_page);
@@ -2017,56 +1870,57 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
status = ntfs_prepare_pages_for_non_resident_write(
pages, do_pages, pos, bytes);
if (unlikely(status)) {
- loff_t i_size;
-
do {
unlock_page(pages[--do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
- /*
- * The write preparation may have instantiated
- * allocated space outside i_size. Trim this
- * off again. We can ignore any errors in this
- * case as we will just be waisting a bit of
- * allocated space, which is not a disaster.
- */
- i_size = i_size_read(vi);
- if (pos + bytes > i_size) {
- ntfs_write_failed(mapping, pos + bytes);
- }
break;
}
}
u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
- if (likely(nr_segs == 1)) {
- copied = ntfs_copy_from_user(pages + u, do_pages - u,
- ofs, buf, bytes);
- buf += copied;
- } else
- copied = ntfs_copy_from_user_iovec(pages + u,
- do_pages - u, ofs, &iov, &iov_ofs,
- bytes);
+ copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
+ i, bytes);
ntfs_flush_dcache_pages(pages + u, do_pages - u);
- status = ntfs_commit_pages_after_write(pages, do_pages, pos,
- bytes);
- if (likely(!status)) {
- written += copied;
- count -= copied;
- pos += copied;
- if (unlikely(copied != bytes))
- status = -EFAULT;
+ status = 0;
+ if (likely(copied == bytes)) {
+ status = ntfs_commit_pages_after_write(pages, do_pages,
+ pos, bytes);
+ if (!status)
+ status = bytes;
}
do {
unlock_page(pages[--do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
- if (unlikely(status))
+ if (unlikely(status < 0))
break;
- balance_dirty_pages_ratelimited(mapping);
+ copied = status;
cond_resched();
- } while (count);
-err_out:
- *ppos = pos;
+ if (unlikely(!copied)) {
+ size_t sc;
+
+ /*
+ * We failed to copy anything. Fall back to single
+ * segment length write.
+ *
+ * This is needed to avoid possible livelock in the
+ * case that all segments in the iov cannot be copied
+ * at once without a pagefault.
+ */
+ sc = iov_iter_single_seg_count(i);
+ if (bytes > sc)
+ bytes = sc;
+ goto again;
+ }
+ iov_iter_advance(i, copied);
+ pos += copied;
+ written += copied;
+ balance_dirty_pages_ratelimited(mapping);
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+ } while (iov_iter_count(i));
if (cached_page)
page_cache_release(cached_page);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
@@ -2076,59 +1930,56 @@ err_out:
}
/**
- * ntfs_file_aio_write_nolock -
+ * ntfs_file_write_iter_nolock - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ *
+ * Basically the same as __generic_file_write_iter() except that it ends
+ * up calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
*/
-static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
+static ssize_t ntfs_file_write_iter_nolock(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos;
- size_t count; /* after file limit checks */
- ssize_t written, err;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ ssize_t err;
+ size_t count = iov_iter_count(from);
- count = iov_length(iov, nr_segs);
- pos = *ppos;
- /* We can write back this queue in page reclaim. */
- current->backing_dev_info = inode_to_bdi(inode);
- written = 0;
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
- if (!count)
- goto out;
- err = file_remove_suid(file);
- if (err)
- goto out;
- err = file_update_time(file);
- if (err)
- goto out;
- written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
- count);
-out:
+ err = ntfs_prepare_file_for_write(file, &pos, &count);
+ if (count && !err) {
+ iov_iter_truncate(from, count);
+ written = ntfs_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ }
current->backing_dev_info = NULL;
return written ? written : err;
}
/**
- * ntfs_file_aio_write -
+ * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * calling ntfs_file_write_iter_nolock() instead of
+ * __generic_file_write_iter().
*/
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *vi = file_inode(file);
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
- mutex_lock(&inode->i_mutex);
- ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
+ mutex_lock(&vi->i_mutex);
+ ret = ntfs_file_write_iter_nolock(iocb, from);
+ mutex_unlock(&vi->i_mutex);
if (ret > 0) {
- int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
@@ -2196,37 +2047,17 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
#endif /* NTFS_RW */
const struct file_operations ntfs_file_ops = {
- .llseek = generic_file_llseek, /* Seek inside file. */
- .read = new_sync_read, /* Read from file. */
- .read_iter = generic_file_read_iter, /* Async read from file. */
+ .llseek = generic_file_llseek,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
#ifdef NTFS_RW
- .write = do_sync_write, /* Write to file. */
- .aio_write = ntfs_file_aio_write, /* Async write to file. */
- /*.release = ,*/ /* Last file is closed. See
- fs/ext2/file.c::
- ext2_release_file() for
- how to use this to discard
- preallocated space for
- write opened files. */
- .fsync = ntfs_file_fsync, /* Sync a file to disk. */
- /*.aio_fsync = ,*/ /* Sync all outstanding async
- i/o operations on a
- kiocb. */
+ .write = new_sync_write,
+ .write_iter = ntfs_file_write_iter,
+ .fsync = ntfs_file_fsync,
#endif /* NTFS_RW */
- /*.ioctl = ,*/ /* Perform function on the
- mounted filesystem. */
- .mmap = generic_file_mmap, /* Mmap file. */
- .open = ntfs_file_open, /* Open file. */
- .splice_read = generic_file_splice_read /* Zero-copy data send with
- the data source being on
- the ntfs partition. We do
- not need to care about the
- data destination. */
- /*.sendpage = ,*/ /* Zero-copy data send with
- the data destination being
- on the ntfs partition. We
- do not need to care about
- the data source. */
+ .mmap = generic_file_mmap,
+ .open = ntfs_file_open,
+ .splice_read = generic_file_splice_read,
};
const struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8490c64..460c6c3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
{
- if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
return 1;
return 0;
}
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 20e37a3..db64ce2 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -102,11 +102,11 @@
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
| OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
| OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
- | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+ | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
- | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
- | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -179,6 +179,11 @@
#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000
+
+/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
*/
@@ -200,10 +205,6 @@
#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
-/*
- * Append Direct IO support
- */
-#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
diff --git a/fs/open.c b/fs/open.c
index 33f9cbf..6a83c47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -570,6 +570,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
uid = make_kuid(current_user_ns(), user);
gid = make_kgid(current_user_ns(), group);
+retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
if (!uid_valid(uid))
@@ -586,7 +587,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
-retry_deleg:
mutex_lock(&inode->i_mutex);
error = security_path_chown(path, uid, gid);
if (!error)
@@ -988,9 +988,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return ERR_PTR(err);
if (flags & O_CREAT)
return ERR_PTR(-EINVAL);
- if (!filename && (flags & O_DIRECTORY))
- if (!dentry->d_inode->i_op->lookup)
- return ERR_PTR(-ENOTDIR);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index ea10a87..24f6404 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -191,7 +191,6 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
ovl_set_timestamps(upperdentry, stat);
return err;
-
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
@@ -385,7 +384,7 @@ int ovl_copy_up(struct dentry *dentry)
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
- if (type != OVL_PATH_LOWER)
+ if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
@@ -394,7 +393,7 @@ int ovl_copy_up(struct dentry *dentry)
parent = dget_parent(next);
type = ovl_path_type(parent);
- if (type != OVL_PATH_LOWER)
+ if (OVL_TYPE_UPPER(type))
break;
dput(next);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8ffc4b9..d139405 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -19,7 +19,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
int err;
dget(wdentry);
- if (S_ISDIR(wdentry->d_inode->i_mode))
+ if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
@@ -118,14 +118,14 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
static int ovl_set_opaque(struct dentry *upperdentry)
{
- return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
+ return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
- err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
+ err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
@@ -152,7 +152,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
- if (type == OVL_PATH_MERGE)
+ if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
@@ -506,7 +506,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
struct dentry *opaquedir = NULL;
int err;
- if (is_dir) {
+ if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
@@ -630,7 +630,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
goto out_drop_write;
type = ovl_path_type(dentry);
- if (type == OVL_PATH_PURE_UPPER) {
+ if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
@@ -693,7 +693,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
- bool is_dir = S_ISDIR(old->d_inode->i_mode);
+ bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
@@ -712,7 +712,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
- if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
+ if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
@@ -720,30 +720,30 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (err)
goto out;
- if (S_ISDIR(new->d_inode->i_mode))
+ if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
- if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
+ if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
- if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
+ if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
- if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
+ if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
- new_type = OVL_PATH_UPPER;
+ new_type = __OVL_PATH_UPPER;
else
- new_type = OVL_PATH_PURE_UPPER;
+ new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
@@ -763,8 +763,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
goto out_drop_write;
}
- old_opaque = old_type != OVL_PATH_PURE_UPPER;
- new_opaque = new_type != OVL_PATH_PURE_UPPER;
+ old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
+ new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
@@ -787,7 +787,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
old_cred = override_creds(override_cred);
}
- if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
+ if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 07d74b2..04f1248 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -205,7 +205,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
static bool ovl_is_private_xattr(const char *name)
{
- return strncmp(name, "trusted.overlay.", 14) == 0;
+ return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
@@ -238,7 +238,10 @@ out:
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
- return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode);
+ if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
+ return S_ISDIR(dentry->d_inode->i_mode);
+ else
+ return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
@@ -299,7 +302,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
- if (type == OVL_PATH_LOWER) {
+ if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
@@ -321,7 +324,7 @@ out:
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
- if (type != OVL_PATH_LOWER)
+ if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
@@ -430,5 +433,4 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
}
return inode;
-
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 814bed3..17ac5af 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -12,13 +12,20 @@
struct ovl_entry;
enum ovl_path_type {
- OVL_PATH_PURE_UPPER,
- OVL_PATH_UPPER,
- OVL_PATH_MERGE,
- OVL_PATH_LOWER,
+ __OVL_PATH_PURE = (1 << 0),
+ __OVL_PATH_UPPER = (1 << 1),
+ __OVL_PATH_MERGE = (1 << 2),
};
-extern const char *ovl_opaque_xattr;
+#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
+#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
+#define OVL_TYPE_MERGE_OR_LOWER(type) \
+ (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
+
+#define OVL_XATTR_PRE_NAME "trusted.overlay."
+#define OVL_XATTR_PRE_LEN 16
+#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -130,6 +137,7 @@ void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index c020599..907870e 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -24,7 +24,6 @@ struct ovl_cache_entry {
struct list_head l_node;
struct rb_node node;
bool is_whiteout;
- bool is_cursor;
char name[];
};
@@ -40,6 +39,7 @@ struct ovl_readdir_data {
struct rb_root root;
struct list_head *list;
struct list_head middle;
+ struct dentry *dir;
int count;
int err;
};
@@ -48,7 +48,7 @@ struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
- struct ovl_cache_entry cursor;
+ struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
@@ -79,23 +79,49 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
return NULL;
}
-static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
+static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
+ const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
- if (p) {
- memcpy(p->name, name, len);
- p->name[len] = '\0';
- p->len = len;
- p->type = d_type;
- p->ino = ino;
- p->is_whiteout = false;
- p->is_cursor = false;
- }
+ if (!p)
+ return NULL;
+
+ memcpy(p->name, name, len);
+ p->name[len] = '\0';
+ p->len = len;
+ p->type = d_type;
+ p->ino = ino;
+ p->is_whiteout = false;
+
+ if (d_type == DT_CHR) {
+ struct dentry *dentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred) {
+ kfree(p);
+ return NULL;
+ }
+
+ /*
+ * CAP_DAC_OVERRIDE for lookup
+ */
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ old_cred = override_creds(override_cred);
+ dentry = lookup_one_len(name, dir, len);
+ if (!IS_ERR(dentry)) {
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ }
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
return p;
}
@@ -122,7 +148,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
return 0;
}
- p = ovl_cache_entry_new(name, len, ino, d_type);
+ p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
@@ -143,7 +169,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd,
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
- p = ovl_cache_entry_new(name, namelen, ino, d_type);
+ p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
@@ -168,7 +194,6 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
- list_del_init(&od->cursor.l_node);
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
@@ -204,6 +229,7 @@ static inline int ovl_dir_read(struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
+ rdd->dir = realpath->dentry;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
@@ -227,108 +253,58 @@ static void ovl_dir_reset(struct file *file)
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
+ od->cursor = NULL;
}
- WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
- if (od->is_real && type == OVL_PATH_MERGE)
+ WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
+ if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
-static int ovl_dir_mark_whiteouts(struct dentry *dir,
- struct ovl_readdir_data *rdd)
-{
- struct ovl_cache_entry *p;
- struct dentry *dentry;
- const struct cred *old_cred;
- struct cred *override_cred;
-
- override_cred = prepare_creds();
- if (!override_cred) {
- ovl_cache_free(rdd->list);
- return -ENOMEM;
- }
-
- /*
- * CAP_DAC_OVERRIDE for lookup
- */
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- old_cred = override_creds(override_cred);
-
- mutex_lock(&dir->d_inode->i_mutex);
- list_for_each_entry(p, rdd->list, l_node) {
- if (p->is_cursor)
- continue;
-
- if (p->type != DT_CHR)
- continue;
-
- dentry = lookup_one_len(p->name, dir, p->len);
- if (IS_ERR(dentry))
- continue;
-
- p->is_whiteout = ovl_is_whiteout(dentry);
- dput(dentry);
- }
- mutex_unlock(&dir->d_inode->i_mutex);
-
- revert_creds(old_cred);
- put_cred(override_cred);
-
- return 0;
-}
-
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
- struct path lowerpath;
- struct path upperpath;
+ struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
+ int idx, next;
- ovl_path_lower(dentry, &lowerpath);
- ovl_path_upper(dentry, &upperpath);
+ for (idx = 0; idx != -1; idx = next) {
+ next = ovl_path_next(idx, dentry, &realpath);
- if (upperpath.dentry) {
- err = ovl_dir_read(&upperpath, &rdd);
- if (err)
- goto out;
-
- if (lowerpath.dentry) {
- err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd);
+ if (next != -1) {
+ err = ovl_dir_read(&realpath, &rdd);
if (err)
- goto out;
+ break;
+ } else {
+ /*
+ * Insert lowest layer entries before upper ones, this
+ * allows offsets to be reasonably constant
+ */
+ list_add(&rdd.middle, rdd.list);
+ rdd.is_merge = true;
+ err = ovl_dir_read(&realpath, &rdd);
+ list_del(&rdd.middle);
}
}
- if (lowerpath.dentry) {
- /*
- * Insert lowerpath entries before upperpath ones, this allows
- * offsets to be reasonably constant
- */
- list_add(&rdd.middle, rdd.list);
- rdd.is_merge = true;
- err = ovl_dir_read(&lowerpath, &rdd);
- list_del(&rdd.middle);
- }
-out:
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
- struct ovl_cache_entry *p;
+ struct list_head *p;
loff_t off = 0;
- list_for_each_entry(p, &od->cache->entries, l_node) {
- if (p->is_cursor)
- continue;
+ list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
- list_move_tail(&od->cursor.l_node, &p->l_node);
+ /* Cursor is safe since the cache is stable */
+ od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
@@ -367,6 +343,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
+ struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
@@ -385,19 +362,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
ovl_seek_cursor(od, ctx->pos);
}
- while (od->cursor.l_node.next != &od->cache->entries) {
- struct ovl_cache_entry *p;
-
- p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
- /* Skip cursors */
- if (!p->is_cursor) {
- if (!p->is_whiteout) {
- if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
- break;
- }
- ctx->pos++;
- }
- list_move(&od->cursor.l_node, &p->l_node);
+ while (od->cursor != &od->cache->entries) {
+ p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
+ if (!p->is_whiteout)
+ if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+ break;
+ od->cursor = p->l_node.next;
+ ctx->pos++;
}
return 0;
}
@@ -452,7 +423,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
/*
* Need to check if we started out being a lower dir, but got copied up
*/
- if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) {
+ if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
@@ -516,11 +487,9 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
kfree(od);
return PTR_ERR(realfile);
}
- INIT_LIST_HEAD(&od->cursor.l_node);
od->realfile = realfile;
- od->is_real = (type != OVL_PATH_MERGE);
- od->is_upper = (type != OVL_PATH_LOWER);
- od->cursor.is_cursor = true;
+ od->is_real = !OVL_TYPE_MERGE(type);
+ od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f16d318..5f0d199 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -35,7 +35,8 @@ struct ovl_config {
/* private information held for overlayfs's superblock */
struct ovl_fs {
struct vfsmount *upper_mnt;
- struct vfsmount *lower_mnt;
+ unsigned numlower;
+ struct vfsmount **lower_mnt;
struct dentry *workdir;
long lower_namelen;
/* pathnames of lower and upper dirs, for show_options */
@@ -47,7 +48,6 @@ struct ovl_dir_cache;
/* private information held for every overlayfs dentry */
struct ovl_entry {
struct dentry *__upperdentry;
- struct dentry *lowerdentry;
struct ovl_dir_cache *cache;
union {
struct {
@@ -56,30 +56,36 @@ struct ovl_entry {
};
struct rcu_head rcu;
};
+ unsigned numlower;
+ struct path lowerstack[];
};
-const char *ovl_opaque_xattr = "trusted.overlay.opaque";
+#define OVL_MAX_STACK 500
+static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe)
+{
+ return oe->numlower ? oe->lowerstack[0].dentry : NULL;
+}
enum ovl_path_type ovl_path_type(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ enum ovl_path_type type = 0;
if (oe->__upperdentry) {
- if (oe->lowerdentry) {
+ type = __OVL_PATH_UPPER;
+
+ if (oe->numlower) {
if (S_ISDIR(dentry->d_inode->i_mode))
- return OVL_PATH_MERGE;
- else
- return OVL_PATH_UPPER;
- } else {
- if (oe->opaque)
- return OVL_PATH_UPPER;
- else
- return OVL_PATH_PURE_UPPER;
+ type |= __OVL_PATH_MERGE;
+ } else if (!oe->opaque) {
+ type |= __OVL_PATH_PURE;
}
} else {
- return OVL_PATH_LOWER;
+ if (oe->numlower > 1)
+ type |= __OVL_PATH_MERGE;
}
+ return type;
}
static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
@@ -98,10 +104,9 @@ void ovl_path_upper(struct dentry *dentry, struct path *path)
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
{
-
enum ovl_path_type type = ovl_path_type(dentry);
- if (type == OVL_PATH_LOWER)
+ if (!OVL_TYPE_UPPER(type))
ovl_path_lower(dentry, path);
else
ovl_path_upper(dentry, path);
@@ -120,7 +125,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- return oe->lowerdentry;
+ return __ovl_dentry_lower(oe);
}
struct dentry *ovl_dentry_real(struct dentry *dentry)
@@ -130,7 +135,7 @@ struct dentry *ovl_dentry_real(struct dentry *dentry)
realdentry = ovl_upperdentry_dereference(oe);
if (!realdentry)
- realdentry = oe->lowerdentry;
+ realdentry = __ovl_dentry_lower(oe);
return realdentry;
}
@@ -143,7 +148,7 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
if (realdentry) {
*is_upper = true;
} else {
- realdentry = oe->lowerdentry;
+ realdentry = __ovl_dentry_lower(oe);
*is_upper = false;
}
return realdentry;
@@ -165,11 +170,9 @@ void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
void ovl_path_lower(struct dentry *dentry, struct path *path)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
struct ovl_entry *oe = dentry->d_fsdata;
- path->mnt = ofs->lower_mnt;
- path->dentry = oe->lowerdentry;
+ *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
}
int ovl_want_write(struct dentry *dentry)
@@ -249,7 +252,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry)
if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
return false;
- res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
+ res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
if (res == 1 && val == 'y')
return true;
@@ -261,8 +264,11 @@ static void ovl_dentry_release(struct dentry *dentry)
struct ovl_entry *oe = dentry->d_fsdata;
if (oe) {
+ unsigned int i;
+
dput(oe->__upperdentry);
- dput(oe->lowerdentry);
+ for (i = 0; i < oe->numlower; i++)
+ dput(oe->lowerstack[i].dentry);
kfree_rcu(oe, rcu);
}
}
@@ -271,9 +277,15 @@ static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
};
-static struct ovl_entry *ovl_alloc_entry(void)
+static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
- return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
+ size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
+ struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+
+ if (oe)
+ oe->numlower = numlower;
+
+ return oe;
}
static inline struct dentry *ovl_lookup_real(struct dentry *dir,
@@ -295,82 +307,154 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
return dentry;
}
+/*
+ * Returns next layer in stack starting from top.
+ * Returns -1 if this is the last layer.
+ */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ BUG_ON(idx < 0);
+ if (idx == 0) {
+ ovl_path_upper(dentry, path);
+ if (path->dentry)
+ return oe->numlower ? 1 : -1;
+ idx++;
+ }
+ BUG_ON(idx > oe->numlower);
+ *path = oe->lowerstack[idx - 1];
+
+ return (idx < oe->numlower) ? idx + 1 : -1;
+}
+
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ovl_entry *oe;
- struct dentry *upperdir;
- struct dentry *lowerdir;
- struct dentry *upperdentry = NULL;
- struct dentry *lowerdentry = NULL;
+ struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ struct path *stack = NULL;
+ struct dentry *upperdir, *upperdentry = NULL;
+ unsigned int ctr = 0;
struct inode *inode = NULL;
+ bool upperopaque = false;
+ struct dentry *this, *prev = NULL;
+ unsigned int i;
int err;
- err = -ENOMEM;
- oe = ovl_alloc_entry();
- if (!oe)
- goto out;
-
- upperdir = ovl_dentry_upper(dentry->d_parent);
- lowerdir = ovl_dentry_lower(dentry->d_parent);
-
+ upperdir = ovl_upperdentry_dereference(poe);
if (upperdir) {
- upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
- err = PTR_ERR(upperdentry);
- if (IS_ERR(upperdentry))
- goto out_put_dir;
-
- if (lowerdir && upperdentry) {
- if (ovl_is_whiteout(upperdentry)) {
- dput(upperdentry);
- upperdentry = NULL;
- oe->opaque = true;
- } else if (ovl_is_opaquedir(upperdentry)) {
- oe->opaque = true;
+ this = ovl_lookup_real(upperdir, &dentry->d_name);
+ err = PTR_ERR(this);
+ if (IS_ERR(this))
+ goto out;
+
+ if (this) {
+ if (ovl_is_whiteout(this)) {
+ dput(this);
+ this = NULL;
+ upperopaque = true;
+ } else if (poe->numlower && ovl_is_opaquedir(this)) {
+ upperopaque = true;
}
}
+ upperdentry = prev = this;
}
- if (lowerdir && !oe->opaque) {
- lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
- err = PTR_ERR(lowerdentry);
- if (IS_ERR(lowerdentry))
- goto out_dput_upper;
+
+ if (!upperopaque && poe->numlower) {
+ err = -ENOMEM;
+ stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
+ if (!stack)
+ goto out_put_upper;
}
- if (lowerdentry && upperdentry &&
- (!S_ISDIR(upperdentry->d_inode->i_mode) ||
- !S_ISDIR(lowerdentry->d_inode->i_mode))) {
- dput(lowerdentry);
- lowerdentry = NULL;
- oe->opaque = true;
+ for (i = 0; !upperopaque && i < poe->numlower; i++) {
+ bool opaque = false;
+ struct path lowerpath = poe->lowerstack[i];
+
+ this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
+ err = PTR_ERR(this);
+ if (IS_ERR(this)) {
+ /*
+ * If it's positive, then treat ENAMETOOLONG as ENOENT.
+ */
+ if (err == -ENAMETOOLONG && (upperdentry || ctr))
+ continue;
+ goto out_put;
+ }
+ if (!this)
+ continue;
+ if (ovl_is_whiteout(this)) {
+ dput(this);
+ break;
+ }
+ /*
+ * Only makes sense to check opaque dir if this is not the
+ * lowermost layer.
+ */
+ if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
+ opaque = true;
+
+ if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
+ !S_ISDIR(this->d_inode->i_mode))) {
+ /*
+ * FIXME: check for upper-opaqueness maybe better done
+ * in remove code.
+ */
+ if (prev == upperdentry)
+ upperopaque = true;
+ dput(this);
+ break;
+ }
+ /*
+ * If this is a non-directory then stop here.
+ */
+ if (!S_ISDIR(this->d_inode->i_mode))
+ opaque = true;
+
+ stack[ctr].dentry = this;
+ stack[ctr].mnt = lowerpath.mnt;
+ ctr++;
+ prev = this;
+ if (opaque)
+ break;
}
- if (lowerdentry || upperdentry) {
+ oe = ovl_alloc_entry(ctr);
+ err = -ENOMEM;
+ if (!oe)
+ goto out_put;
+
+ if (upperdentry || ctr) {
struct dentry *realdentry;
- realdentry = upperdentry ? upperdentry : lowerdentry;
+ realdentry = upperdentry ? upperdentry : stack[0].dentry;
+
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
oe);
if (!inode)
- goto out_dput;
+ goto out_free_oe;
ovl_copyattr(realdentry->d_inode, inode);
}
+ oe->opaque = upperopaque;
oe->__upperdentry = upperdentry;
- oe->lowerdentry = lowerdentry;
-
+ memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+ kfree(stack);
dentry->d_fsdata = oe;
d_add(dentry, inode);
return NULL;
-out_dput:
- dput(lowerdentry);
-out_dput_upper:
- dput(upperdentry);
-out_put_dir:
+out_free_oe:
kfree(oe);
+out_put:
+ for (i = 0; i < ctr; i++)
+ dput(stack[i].dentry);
+ kfree(stack);
+out_put_upper:
+ dput(upperdentry);
out:
return ERR_PTR(err);
}
@@ -383,10 +467,12 @@ struct file *ovl_path_open(struct path *path, int flags)
static void ovl_put_super(struct super_block *sb)
{
struct ovl_fs *ufs = sb->s_fs_info;
+ unsigned i;
dput(ufs->workdir);
mntput(ufs->upper_mnt);
- mntput(ufs->lower_mnt);
+ for (i = 0; i < ufs->numlower; i++)
+ mntput(ufs->lower_mnt[i]);
kfree(ufs->config.lowerdir);
kfree(ufs->config.upperdir);
@@ -400,7 +486,7 @@ static void ovl_put_super(struct super_block *sb)
* @buf: The struct kstatfs to fill in with stats
*
* Get the filesystem statistics. As writes always target the upper layer
- * filesystem pass the statfs to the same filesystem.
+ * filesystem pass the statfs to the upper filesystem (if it exists)
*/
static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
{
@@ -409,7 +495,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
struct path path;
int err;
- ovl_path_upper(root_dentry, &path);
+ ovl_path_real(root_dentry, &path);
err = vfs_statfs(&path, buf);
if (!err) {
@@ -432,8 +518,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
struct ovl_fs *ufs = sb->s_fs_info;
seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
- seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
- seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ if (ufs->config.upperdir) {
+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+ seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ }
+ return 0;
+}
+
+static int ovl_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
+ return -EROFS;
+
return 0;
}
@@ -441,6 +539,7 @@ static const struct super_operations ovl_super_operations = {
.put_super = ovl_put_super,
.statfs = ovl_statfs,
.show_options = ovl_show_options,
+ .remount_fs = ovl_remount,
};
enum {
@@ -515,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
break;
default:
+ pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
return -EINVAL;
}
}
+
+ /* Workdir is useless in non-upper mount */
+ if (!config->upperdir && config->workdir) {
+ pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ config->workdir);
+ kfree(config->workdir);
+ config->workdir = NULL;
+ }
+
return 0;
}
@@ -585,24 +694,6 @@ static void ovl_unescape(char *s)
}
}
-static int ovl_mount_dir(const char *name, struct path *path)
-{
- int err;
- char *tmp = kstrdup(name, GFP_KERNEL);
-
- if (!tmp)
- return -ENOMEM;
-
- ovl_unescape(tmp);
- err = kern_path(tmp, LOOKUP_FOLLOW, path);
- if (err) {
- pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err);
- err = -EINVAL;
- }
- kfree(tmp);
- return err;
-}
-
static bool ovl_is_allowed_fs_type(struct dentry *root)
{
const struct dentry_operations *dop = root->d_op;
@@ -622,6 +713,75 @@ static bool ovl_is_allowed_fs_type(struct dentry *root)
return true;
}
+static int ovl_mount_dir_noesc(const char *name, struct path *path)
+{
+ int err = -EINVAL;
+
+ if (!*name) {
+ pr_err("overlayfs: empty lowerdir\n");
+ goto out;
+ }
+ err = kern_path(name, LOOKUP_FOLLOW, path);
+ if (err) {
+ pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+ goto out;
+ }
+ err = -EINVAL;
+ if (!ovl_is_allowed_fs_type(path->dentry)) {
+ pr_err("overlayfs: filesystem on '%s' not supported\n", name);
+ goto out_put;
+ }
+ if (!S_ISDIR(path->dentry->d_inode->i_mode)) {
+ pr_err("overlayfs: '%s' not a directory\n", name);
+ goto out_put;
+ }
+ return 0;
+
+out_put:
+ path_put(path);
+out:
+ return err;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+ int err = -ENOMEM;
+ char *tmp = kstrdup(name, GFP_KERNEL);
+
+ if (tmp) {
+ ovl_unescape(tmp);
+ err = ovl_mount_dir_noesc(tmp, path);
+ kfree(tmp);
+ }
+ return err;
+}
+
+static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
+ int *stack_depth)
+{
+ int err;
+ struct kstatfs statfs;
+
+ err = ovl_mount_dir_noesc(name, path);
+ if (err)
+ goto out;
+
+ err = vfs_statfs(path, &statfs);
+ if (err) {
+ pr_err("overlayfs: statfs failed on '%s'\n", name);
+ goto out_put;
+ }
+ *namelen = max(*namelen, statfs.f_namelen);
+ *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+
+ return 0;
+
+out_put:
+ path_put(path);
+out:
+ return err;
+}
+
/* Workdir should not be subdir of upperdir and vice versa */
static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
{
@@ -634,16 +794,39 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
return ok;
}
+static unsigned int ovl_split_lowerdirs(char *str)
+{
+ unsigned int ctr = 1;
+ char *s, *d;
+
+ for (s = d = str;; s++, d++) {
+ if (*s == '\\') {
+ s++;
+ } else if (*s == ':') {
+ *d = '\0';
+ ctr++;
+ continue;
+ }
+ *d = *s;
+ if (!*s)
+ break;
+ }
+ return ctr;
+}
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct path lowerpath;
- struct path upperpath;
- struct path workpath;
- struct inode *root_inode;
+ struct path upperpath = { NULL, NULL };
+ struct path workpath = { NULL, NULL };
struct dentry *root_dentry;
struct ovl_entry *oe;
struct ovl_fs *ufs;
- struct kstatfs statfs;
+ struct path *stack = NULL;
+ char *lowertmp;
+ char *lower;
+ unsigned int numlower;
+ unsigned int stacklen = 0;
+ unsigned int i;
int err;
err = -ENOMEM;
@@ -655,123 +838,147 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_free_config;
- /* FIXME: workdir is not needed for a R/O mount */
err = -EINVAL;
- if (!ufs->config.upperdir || !ufs->config.lowerdir ||
- !ufs->config.workdir) {
- pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
+ if (!ufs->config.lowerdir) {
+ pr_err("overlayfs: missing 'lowerdir'\n");
goto out_free_config;
}
- err = -ENOMEM;
- oe = ovl_alloc_entry();
- if (oe == NULL)
- goto out_free_config;
-
- err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
- if (err)
- goto out_free_oe;
+ sb->s_stack_depth = 0;
+ if (ufs->config.upperdir) {
+ if (!ufs->config.workdir) {
+ pr_err("overlayfs: missing 'workdir'\n");
+ goto out_free_config;
+ }
- err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
- if (err)
- goto out_put_upperpath;
+ err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
+ if (err)
+ goto out_free_config;
- err = ovl_mount_dir(ufs->config.workdir, &workpath);
- if (err)
- goto out_put_lowerpath;
+ /* Upper fs should not be r/o */
+ if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
+ pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+ err = -EINVAL;
+ goto out_put_upperpath;
+ }
- err = -EINVAL;
- if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
- !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
- !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
- pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
- goto out_put_workpath;
- }
+ err = ovl_mount_dir(ufs->config.workdir, &workpath);
+ if (err)
+ goto out_put_upperpath;
- if (upperpath.mnt != workpath.mnt) {
- pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
- goto out_put_workpath;
+ err = -EINVAL;
+ if (upperpath.mnt != workpath.mnt) {
+ pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+ goto out_put_workpath;
+ }
+ if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+ pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ goto out_put_workpath;
+ }
+ sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth;
}
- if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
- pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ err = -ENOMEM;
+ lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL);
+ if (!lowertmp)
goto out_put_workpath;
- }
- if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
- pr_err("overlayfs: filesystem of upperdir is not supported\n");
- goto out_put_workpath;
+ err = -EINVAL;
+ stacklen = ovl_split_lowerdirs(lowertmp);
+ if (stacklen > OVL_MAX_STACK) {
+ pr_err("overlayfs: too many lower directries, limit is %d\n",
+ OVL_MAX_STACK);
+ goto out_free_lowertmp;
+ } else if (!ufs->config.upperdir && stacklen == 1) {
+ pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+ goto out_free_lowertmp;
}
- if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
- pr_err("overlayfs: filesystem of lowerdir is not supported\n");
- goto out_put_workpath;
- }
+ stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
+ if (!stack)
+ goto out_free_lowertmp;
- err = vfs_statfs(&lowerpath, &statfs);
- if (err) {
- pr_err("overlayfs: statfs failed on lowerpath\n");
- goto out_put_workpath;
- }
- ufs->lower_namelen = statfs.f_namelen;
+ lower = lowertmp;
+ for (numlower = 0; numlower < stacklen; numlower++) {
+ err = ovl_lower_dir(lower, &stack[numlower],
+ &ufs->lower_namelen, &sb->s_stack_depth);
+ if (err)
+ goto out_put_lowerpath;
- sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
- lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
+ lower = strchr(lower, '\0') + 1;
+ }
err = -EINVAL;
+ sb->s_stack_depth++;
if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
pr_err("overlayfs: maximum fs stacking depth exceeded\n");
- goto out_put_workpath;
+ goto out_put_lowerpath;
}
- ufs->upper_mnt = clone_private_mount(&upperpath);
- err = PTR_ERR(ufs->upper_mnt);
- if (IS_ERR(ufs->upper_mnt)) {
- pr_err("overlayfs: failed to clone upperpath\n");
- goto out_put_workpath;
- }
+ if (ufs->config.upperdir) {
+ ufs->upper_mnt = clone_private_mount(&upperpath);
+ err = PTR_ERR(ufs->upper_mnt);
+ if (IS_ERR(ufs->upper_mnt)) {
+ pr_err("overlayfs: failed to clone upperpath\n");
+ goto out_put_lowerpath;
+ }
- ufs->lower_mnt = clone_private_mount(&lowerpath);
- err = PTR_ERR(ufs->lower_mnt);
- if (IS_ERR(ufs->lower_mnt)) {
- pr_err("overlayfs: failed to clone lowerpath\n");
- goto out_put_upper_mnt;
+ ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+ err = PTR_ERR(ufs->workdir);
+ if (IS_ERR(ufs->workdir)) {
+ pr_err("overlayfs: failed to create directory %s/%s\n",
+ ufs->config.workdir, OVL_WORKDIR_NAME);
+ goto out_put_upper_mnt;
+ }
}
- ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
- err = PTR_ERR(ufs->workdir);
- if (IS_ERR(ufs->workdir)) {
- pr_err("overlayfs: failed to create directory %s/%s\n",
- ufs->config.workdir, OVL_WORKDIR_NAME);
- goto out_put_lower_mnt;
- }
+ err = -ENOMEM;
+ ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL);
+ if (ufs->lower_mnt == NULL)
+ goto out_put_workdir;
+ for (i = 0; i < numlower; i++) {
+ struct vfsmount *mnt = clone_private_mount(&stack[i]);
- /*
- * Make lower_mnt R/O. That way fchmod/fchown on lower file
- * will fail instead of modifying lower fs.
- */
- ufs->lower_mnt->mnt_flags |= MNT_READONLY;
+ err = PTR_ERR(mnt);
+ if (IS_ERR(mnt)) {
+ pr_err("overlayfs: failed to clone lowerpath\n");
+ goto out_put_lower_mnt;
+ }
+ /*
+ * Make lower_mnt R/O. That way fchmod/fchown on lower file
+ * will fail instead of modifying lower fs.
+ */
+ mnt->mnt_flags |= MNT_READONLY;
+
+ ufs->lower_mnt[ufs->numlower] = mnt;
+ ufs->numlower++;
+ }
- /* If the upper fs is r/o, we mark overlayfs r/o too */
- if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
+ /* If the upper fs is nonexistent, we mark overlayfs r/o too */
+ if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
- root_inode = ovl_new_inode(sb, S_IFDIR, oe);
- if (!root_inode)
- goto out_put_workdir;
+ oe = ovl_alloc_entry(numlower);
+ if (!oe)
+ goto out_put_lower_mnt;
- root_dentry = d_make_root(root_inode);
+ root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
if (!root_dentry)
- goto out_put_workdir;
+ goto out_free_oe;
mntput(upperpath.mnt);
- mntput(lowerpath.mnt);
+ for (i = 0; i < numlower; i++)
+ mntput(stack[i].mnt);
path_put(&workpath);
+ kfree(lowertmp);
oe->__upperdentry = upperpath.dentry;
- oe->lowerdentry = lowerpath.dentry;
+ for (i = 0; i < numlower; i++) {
+ oe->lowerstack[i].dentry = stack[i].dentry;
+ oe->lowerstack[i].mnt = ufs->lower_mnt[i];
+ }
root_dentry->d_fsdata = oe;
@@ -782,20 +989,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
return 0;
+out_free_oe:
+ kfree(oe);
+out_put_lower_mnt:
+ for (i = 0; i < ufs->numlower; i++)
+ mntput(ufs->lower_mnt[i]);
+ kfree(ufs->lower_mnt);
out_put_workdir:
dput(ufs->workdir);
-out_put_lower_mnt:
- mntput(ufs->lower_mnt);
out_put_upper_mnt:
mntput(ufs->upper_mnt);
+out_put_lowerpath:
+ for (i = 0; i < numlower; i++)
+ path_put(&stack[i]);
+ kfree(stack);
+out_free_lowertmp:
+ kfree(lowertmp);
out_put_workpath:
path_put(&workpath);
-out_put_lowerpath:
- path_put(&lowerpath);
out_put_upperpath:
path_put(&upperpath);
-out_free_oe:
- kfree(oe);
out_free_config:
kfree(ufs->config.lowerdir);
kfree(ufs->config.upperdir);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 0855f77..3a48bb7 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -564,13 +564,11 @@ posix_acl_create(struct inode *dir, umode_t *mode,
*acl = posix_acl_clone(p, GFP_NOFS);
if (!*acl)
- return -ENOMEM;
+ goto no_mem;
ret = posix_acl_create_masq(*acl, mode);
- if (ret < 0) {
- posix_acl_release(*acl);
- return -ENOMEM;
- }
+ if (ret < 0)
+ goto no_mem_clone;
if (ret == 0) {
posix_acl_release(*acl);
@@ -591,6 +589,12 @@ no_acl:
*default_acl = NULL;
*acl = NULL;
return 0;
+
+no_mem_clone:
+ posix_acl_release(*acl);
+no_mem:
+ posix_acl_release(p);
+ return -ENOMEM;
}
EXPORT_SYMBOL_GPL(posix_acl_create);
@@ -772,7 +776,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
if (!IS_POSIXACL(dentry->d_inode))
return -EOPNOTSUPP;
- if (S_ISLNK(dentry->d_inode->i_mode))
+ if (d_is_symlink(dentry))
return -EOPNOTSUPP;
acl = get_acl(dentry->d_inode, type);
@@ -832,7 +836,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
if (!IS_POSIXACL(dentry->d_inode))
return -EOPNOTSUPP;
- if (S_ISLNK(dentry->d_inode->i_mode))
+ if (d_is_symlink(dentry))
return -EOPNOTSUPP;
if (type == ACL_TYPE_ACCESS)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 3309f59..be65b20 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -19,7 +19,6 @@
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/idr.h>
-#include <linux/namei.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
@@ -223,17 +222,6 @@ void proc_free_inum(unsigned int inum)
spin_unlock_irqrestore(&proc_inum_lock, flags);
}
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- nd_set_link(nd, __PDE_DATA(dentry->d_inode));
- return NULL;
-}
-
-static const struct inode_operations proc_link_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = proc_follow_link,
-};
-
/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 13a50a3..7697b66 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/magic.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
@@ -393,6 +394,26 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
+static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct proc_dir_entry *pde = PDE(dentry->d_inode);
+ if (unlikely(!use_pde(pde)))
+ return ERR_PTR(-EINVAL);
+ nd_set_link(nd, pde->data);
+ return pde;
+}
+
+static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+ unuse_pde(p);
+}
+
+const struct inode_operations proc_link_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = proc_follow_link,
+ .put_link = proc_put_link,
+};
+
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
struct inode *inode = new_inode_pseudo(sb);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 6fcdba5..c835b94 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -200,6 +200,7 @@ struct pde_opener {
int closing;
struct completion *c;
};
+extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 956b75d..6dee68d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1325,6 +1325,9 @@ out:
static int pagemap_open(struct inode *inode, struct file *file)
{
+ /* do not disclose physical addresses: attack vector */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
"to stop being page-shift some time soon. See the "
"linux/Documentation/vm/pagemap.txt for details.\n");
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 04b0614..4e781e6 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -266,7 +266,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
struct dentry *dentry = buf.dentries[i];
- if (!S_ISDIR(dentry->d_inode->i_mode))
+ if (!d_is_dir(dentry))
err = action(dentry, data);
dput(dentry);
@@ -322,7 +322,7 @@ static int delete_one_xattr(struct dentry *dentry, void *data)
struct inode *dir = dentry->d_parent->d_inode;
/* This is the xattr dir, handle specially. */
- if (S_ISDIR(dentry->d_inode->i_mode))
+ if (d_is_dir(dentry))
return xattr_rmdir(dir, dentry);
return xattr_unlink(dir, dentry);
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3ce..19636af 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -66,7 +66,7 @@ int vfs_getattr(struct path *path, struct kstat *stat)
{
int retval;
- retval = security_inode_getattr(path->mnt, path->dentry);
+ retval = security_inode_getattr(path);
if (retval)
return retval;
return vfs_getattr_nosec(path, stat);
diff --git a/fs/super.c b/fs/super.c
index 65a53ef..2b7dc90 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,7 +71,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
- if (!grab_super_passive(sb))
+ if (!trylock_super(sb))
return SHRINK_STOP;
if (sb->s_op->nr_cached_objects)
@@ -105,7 +105,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
freed += sb->s_op->free_cached_objects(sb, sc);
}
- drop_super(sb);
+ up_read(&sb->s_umount);
return freed;
}
@@ -118,7 +118,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
/*
- * Don't call grab_super_passive as it is a potential
+ * Don't call trylock_super as it is a potential
* scalability bottleneck. The counts could get updated
* between super_cache_count and super_cache_scan anyway.
* Call to super_cache_count with shrinker_rwsem held
@@ -348,35 +348,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
}
/*
- * grab_super_passive - acquire a passive reference
+ * trylock_super - try to grab ->s_umount shared
* @sb: reference we are trying to grab
*
- * Tries to acquire a passive reference. This is used in places where we
+ * Try to prevent fs shutdown. This is used in places where we
* cannot take an active reference but we need to ensure that the
- * superblock does not go away while we are working on it. It returns
- * false if a reference was not gained, and returns true with the s_umount
- * lock held in read mode if a reference is gained. On successful return,
- * the caller must drop the s_umount lock and the passive reference when
- * done.
+ * filesystem is not shut down while we are working on it. It returns
+ * false if we cannot acquire s_umount or if we lose the race and
+ * filesystem already got into shutdown, and returns true with the s_umount
+ * lock held in read mode in case of success. On successful return,
+ * the caller must drop the s_umount lock when done.
+ *
+ * Note that unlike get_super() et.al. this one does *not* bump ->s_count.
+ * The reason why it's safe is that we are OK with doing trylock instead
+ * of down_read(). There's a couple of places that are OK with that, but
+ * it's very much not a general-purpose interface.
*/
-bool grab_super_passive(struct super_block *sb)
+bool trylock_super(struct super_block *sb)
{
- spin_lock(&sb_lock);
- if (hlist_unhashed(&sb->s_instances)) {
- spin_unlock(&sb_lock);
- return false;
- }
-
- sb->s_count++;
- spin_unlock(&sb_lock);
-
if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root && (sb->s_flags & MS_BORN))
+ if (!hlist_unhashed(&sb->s_instances) &&
+ sb->s_root && (sb->s_flags & MS_BORN))
return true;
up_read(&sb->s_umount);
}
- put_super(sb);
return false;
}
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d617999..df68285 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5eb4a14..b97359b 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_pnfs.h"
/*
* Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = {
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
.commit_metadata = xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+ .get_uuid = xfs_fs_get_uuid,
+ .map_blocks = xfs_fs_map_blocks,
+ .commit_blocks = xfs_fs_commit_blocks,
+#endif
};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f527618..f44212f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
+#include "xfs_pnfs.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
@@ -395,7 +396,8 @@ STATIC int /* error (positive) */
xfs_zero_last_block(
struct xfs_inode *ip,
xfs_fsize_t offset,
- xfs_fsize_t isize)
+ xfs_fsize_t isize,
+ bool *did_zeroing)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
@@ -423,6 +425,7 @@ xfs_zero_last_block(
zero_len = mp->m_sb.sb_blocksize - zero_offset;
if (isize + zero_len > offset)
zero_len = offset - isize;
+ *did_zeroing = true;
return xfs_iozero(ip, isize, zero_len);
}
@@ -441,7 +444,8 @@ int /* error (positive) */
xfs_zero_eof(
struct xfs_inode *ip,
xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
+ xfs_fsize_t isize, /* current inode size */
+ bool *did_zeroing)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t start_zero_fsb;
@@ -463,7 +467,7 @@ xfs_zero_eof(
* We only zero a part of that block so it is handled specially.
*/
if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize);
+ error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
if (error)
return error;
}
@@ -523,6 +527,7 @@ xfs_zero_eof(
if (error)
return error;
+ *did_zeroing = true;
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
}
@@ -553,6 +558,10 @@ restart:
if (error)
return error;
+ error = xfs_break_layouts(inode, iolock);
+ if (error)
+ return error;
+
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
@@ -561,13 +570,15 @@ restart:
* having to redo all checks before.
*/
if (*pos > i_size_read(inode)) {
+ bool zero = false;
+
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, *iolock);
goto restart;
}
- error = xfs_zero_eof(ip, *pos, i_size_read(inode));
+ error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
if (error)
return error;
}
@@ -821,6 +832,7 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
enum xfs_prealloc_flags flags = 0;
+ uint iolock = XFS_IOLOCK_EXCL;
loff_t new_size = 0;
if (!S_ISREG(inode->i_mode))
@@ -829,7 +841,11 @@ xfs_file_fallocate(
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(inode, &iolock);
+ if (error)
+ goto out_unlock;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
@@ -893,7 +909,7 @@ xfs_file_fallocate(
}
out_unlock:
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, iolock);
return error;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fba6532..74efe5b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -602,6 +602,12 @@ xfs_growfs_data(
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
error = xfs_growfs_data_private(mp, in);
+ /*
+ * Increment the generation unconditionally, the error could be from
+ * updating the secondary superblocks, in which case the new size
+ * is live already.
+ */
+ mp->m_generation++;
mutex_unlock(&mp->m_growlock);
return error;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index daafa1f..6163767 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2867,6 +2867,10 @@ xfs_rename(
* Handle RENAME_EXCHANGE flags
*/
if (flags & RENAME_EXCHANGE) {
+ if (target_ip == NULL) {
+ error = -EINVAL;
+ goto error_return;
+ }
error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
target_dp, target_name, target_ip,
&free_list, &first_block, spaceres);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 86cd6b3..a1cd55f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -384,10 +384,11 @@ enum xfs_prealloc_flags {
XFS_PREALLOC_INVISIBLE = (1 << 4),
};
-int xfs_update_prealloc_flags(struct xfs_inode *,
- enum xfs_prealloc_flags);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
+int xfs_update_prealloc_flags(struct xfs_inode *ip,
+ enum xfs_prealloc_flags flags);
+int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_fsize_t isize, bool *did_zeroing);
+int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f7afb86..ac4feae 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,6 +39,7 @@
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans.h"
+#include "xfs_pnfs.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -286,7 +287,7 @@ xfs_readlink_by_handle(
return PTR_ERR(dentry);
/* Restrict this handle operation to symlinks only. */
- if (!S_ISLNK(dentry->d_inode->i_mode)) {
+ if (!d_is_symlink(dentry)) {
error = -EINVAL;
goto out_dput;
}
@@ -608,6 +609,7 @@ xfs_ioc_space(
{
struct iattr iattr;
enum xfs_prealloc_flags flags = 0;
+ uint iolock = XFS_IOLOCK_EXCL;
int error;
/*
@@ -636,7 +638,10 @@ xfs_ioc_space(
if (error)
return error;
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(inode, &iolock);
+ if (error)
+ goto out_unlock;
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
@@ -725,7 +730,7 @@ xfs_ioc_space(
error = xfs_update_prealloc_flags(ip, flags);
out_unlock:
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(ip, iolock);
mnt_drop_write_file(filp);
return error;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ce80eeb..e53a903 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,6 +37,7 @@
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_trans_space.h"
+#include "xfs_pnfs.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -505,7 +506,7 @@ xfs_setattr_mode(
inode->i_mode |= mode & ~S_IFMT;
}
-static void
+void
xfs_setattr_time(
struct xfs_inode *ip,
struct iattr *iattr)
@@ -750,6 +751,7 @@ xfs_setattr_size(
int error;
uint lock_flags = 0;
uint commit_flags = 0;
+ bool did_zeroing = false;
trace_xfs_setattr(ip);
@@ -793,20 +795,16 @@ xfs_setattr_size(
return error;
/*
- * Now we can make the changes. Before we join the inode to the
- * transaction, take care of the part of the truncation that must be
- * done without the inode lock. This needs to be done before joining
- * the inode to the transaction, because the inode cannot be unlocked
- * once it is a part of the transaction.
+ * File data changes must be complete before we start the transaction to
+ * modify the inode. This needs to be done before joining the inode to
+ * the transaction because the inode cannot be unlocked once it is a
+ * part of the transaction.
+ *
+ * Start with zeroing any data block beyond EOF that we may expose on
+ * file extension.
*/
if (newsize > oldsize) {
- /*
- * Do the first part of growing a file: zero any data in the
- * last block that is beyond the old EOF. We need to do this
- * before the inode is joined to the transaction to modify
- * i_size.
- */
- error = xfs_zero_eof(ip, newsize, oldsize);
+ error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
if (error)
return error;
}
@@ -816,23 +814,18 @@ xfs_setattr_size(
* any previous writes that are beyond the on disk EOF and the new
* EOF that have not been written out need to be written here. If we
* do not write the data out, we expose ourselves to the null files
- * problem.
- *
- * Only flush from the on disk size to the smaller of the in memory
- * file size or the new size as that's the range we really care about
- * here and prevents waiting for other data not within the range we
- * care about here.
+ * problem. Note that this includes any block zeroing we did above;
+ * otherwise those blocks may not be zeroed after a crash.
*/
- if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+ if (newsize > ip->i_d.di_size &&
+ (oldsize != ip->i_d.di_size || did_zeroing)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
return error;
}
- /*
- * Wait for all direct I/O to complete.
- */
+ /* Now wait for all direct I/O to complete. */
inode_dio_wait(inode);
/*
@@ -979,9 +972,13 @@ xfs_vn_setattr(
int error;
if (iattr->ia_valid & ATTR_SIZE) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- error = xfs_setattr_size(ip, iattr);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ uint iolock = XFS_IOLOCK_EXCL;
+
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(dentry->d_inode, &iolock);
+ if (!error)
+ error = xfs_setattr_size(ip, iattr);
+ xfs_iunlock(ip, iolock);
} else {
error = xfs_setattr_nonsize(ip, iattr, 0);
}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 1c34e43..ea7a98e 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
*/
#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
int flags);
extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5b2ff8..0d8abd6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -174,6 +174,17 @@ typedef struct xfs_mount {
struct workqueue_struct *m_reclaim_workqueue;
struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
+
+ /*
+ * Generation of the filesysyem layout. This is incremented by each
+ * growfs, and used by the pNFS server to ensure the client updates
+ * its view of the block device once it gets a layout that might
+ * reference the newly added blocks. Does not need to be persistent
+ * as long as we only allow file system size increments, but if we
+ * ever support shrinks it would have to be persisted in addition
+ * to various other kinds of pain inflicted on the pNFS server.
+ */
+ __uint32_t m_generation;
} xfs_mount_t;
/*
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 0000000..365dd57
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_pnfs.h"
+
+/*
+ * Ensure that we do not have any outstanding pNFS layouts that can be used by
+ * clients to directly read from or write to this inode. This must be called
+ * before every operation that can remove blocks from the extent map.
+ * Additionally we call it during the write operation, where aren't concerned
+ * about exposing unallocated blocks but just want to provide basic
+ * synchronization between a local writer and pNFS clients. mmap writes would
+ * also benefit from this sort of synchronization, but due to the tricky locking
+ * rules in the page fault path we don't bother.
+ */
+int
+xfs_break_layouts(
+ struct inode *inode,
+ uint *iolock)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+
+ while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
+ xfs_iunlock(ip, *iolock);
+ error = break_layout(inode, true);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ }
+
+ return error;
+}
+
+/*
+ * Get a unique ID including its location so that the client can identify
+ * the exported device.
+ */
+int
+xfs_fs_get_uuid(
+ struct super_block *sb,
+ u8 *buf,
+ u32 *len,
+ u64 *offset)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ printk_once(KERN_NOTICE
+"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
+ mp->m_fsname);
+
+ if (*len < sizeof(uuid_t))
+ return -EINVAL;
+
+ memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+ *len = sizeof(uuid_t);
+ *offset = offsetof(struct xfs_dsb, sb_uuid);
+ return 0;
+}
+
+static void
+xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (imap->br_startblock == HOLESTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_HOLE;
+ } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->type = IOMAP_DELALLOC;
+ } else {
+ iomap->blkno =
+ XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
+ if (imap->br_state == XFS_EXT_UNWRITTEN)
+ iomap->type = IOMAP_UNWRITTEN;
+ else
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+ iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+}
+
+/*
+ * Get a layout for the pNFS client.
+ */
+int
+xfs_fs_map_blocks(
+ struct inode *inode,
+ loff_t offset,
+ u64 length,
+ struct iomap *iomap,
+ bool write,
+ u32 *device_generation)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ loff_t limit;
+ int bmapi_flags = XFS_BMAPI_ENTIRE;
+ int nimaps = 1;
+ uint lock_flags;
+ int error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /*
+ * We can't export inodes residing on the realtime device. The realtime
+ * device doesn't have a UUID to identify it, so the client has no way
+ * to find it.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return -ENXIO;
+
+ /*
+ * Lock out any other I/O before we flush and invalidate the pagecache,
+ * and then hand out a layout to the remote system. This is very
+ * similar to direct I/O, except that the synchronization is much more
+ * complicated. See the comment near xfs_break_layouts for a detailed
+ * explanation.
+ */
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ error = -EINVAL;
+ limit = mp->m_super->s_maxbytes;
+ if (!write)
+ limit = max(limit, round_up(i_size_read(inode),
+ inode->i_sb->s_blocksize));
+ if (offset > limit)
+ goto out_unlock;
+ if (offset > limit - length)
+ length = limit - offset;
+
+ error = filemap_write_and_wait(inode->i_mapping);
+ if (error)
+ goto out_unlock;
+ error = invalidate_inode_pages2(inode->i_mapping);
+ if (WARN_ON_ONCE(error))
+ return error;
+
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+ lock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, bmapi_flags);
+ xfs_iunlock(ip, lock_flags);
+
+ if (error)
+ goto out_unlock;
+
+ if (write) {
+ enum xfs_prealloc_flags flags = 0;
+
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+ if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+ error = xfs_iomap_write_direct(ip, offset, length,
+ &imap, nimaps);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Ensure the next transaction is committed
+ * synchronously so that the blocks allocated and
+ * handed out to the client are guaranteed to be
+ * present even after a server crash.
+ */
+ flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
+ }
+
+ error = xfs_update_prealloc_flags(ip, flags);
+ if (error)
+ goto out_unlock;
+ }
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ *device_generation = mp->m_generation;
+ return error;
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+/*
+ * Ensure the size update falls into a valid allocated block.
+ */
+static int
+xfs_pnfs_validate_isize(
+ struct xfs_inode *ip,
+ xfs_off_t isize)
+{
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
+ &imap, &nimaps, 0);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ return error;
+
+ if (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_startblock == DELAYSTARTBLOCK ||
+ imap.br_state == XFS_EXT_UNWRITTEN)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Make sure the blocks described by maps are stable on disk. This includes
+ * converting any unwritten extents, flushing the disk cache and updating the
+ * time stamps.
+ *
+ * Note that we rely on the caller to always send us a timestamp update so that
+ * we always commit a transaction here. If that stops being true we will have
+ * to manually flush the cache here similar to what the fsync code path does
+ * for datasyncs on files that have no dirty metadata.
+ */
+int
+xfs_fs_commit_blocks(
+ struct inode *inode,
+ struct iomap *maps,
+ int nr_maps,
+ struct iattr *iattr)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ bool update_isize = false;
+ int error, i;
+ loff_t size;
+
+ ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ size = i_size_read(inode);
+ if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
+ update_isize = true;
+ size = iattr->ia_size;
+ }
+
+ for (i = 0; i < nr_maps; i++) {
+ u64 start, length, end;
+
+ start = maps[i].offset;
+ if (start > size)
+ continue;
+
+ end = start + maps[i].length;
+ if (end > size)
+ end = size;
+
+ length = end - start;
+ if (!length)
+ continue;
+
+ /*
+ * Make sure reads through the pagecache see the new data.
+ */
+ error = invalidate_inode_pages2_range(inode->i_mapping,
+ start >> PAGE_CACHE_SHIFT,
+ (end - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(error);
+
+ error = xfs_iomap_write_unwritten(ip, start, length);
+ if (error)
+ goto out_drop_iolock;
+ }
+
+ if (update_isize) {
+ error = xfs_pnfs_validate_isize(ip, size);
+ if (error)
+ goto out_drop_iolock;
+ }
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_drop_iolock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ xfs_setattr_time(ip, iattr);
+ if (update_isize) {
+ i_size_write(inode, iattr->ia_size);
+ ip->i_d.di_size = iattr->ia_size;
+ }
+
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+
+out_drop_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 0000000..b7fbfce
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,18 @@
+#ifndef _XFS_PNFS_H
+#define _XFS_PNFS_H 1
+
+#ifdef CONFIG_NFSD_PNFS
+int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
+ struct iomap *iomap, bool write, u32 *device_generation);
+int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
+ struct iattr *iattr);
+
+int xfs_break_layouts(struct inode *inode, uint *iolock);
+#else
+static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
+{
+ return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 53cc2aa..fbbb9e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -836,6 +836,11 @@ xfs_qm_reset_dqcounts(
*/
xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
"xfs_quotacheck");
+ /*
+ * Reset type in case we are reusing group quota file for
+ * project quotas or vice versa
+ */
+ ddq->d_flags = type;
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
OpenPOWER on IntegriCloud