author      Jens Axboe <jaxboe@fusionio.com>    2010-06-01 12:42:12 +0200
committer   Jens Axboe <jaxboe@fusionio.com>    2010-06-01 12:42:12 +0200
commit      b4ca761577535b2b4d153689ee97342797dfff05 (patch)
tree        29054d55508f1faa22ec32acf7c245751af03348 /fs
parent      28f4197e5d4707311febeec8a0eb97cb5fd93c97 (diff)
parent      67a3e12b05e055c0415c556a315a3d3eb637e29e (diff)
Merge branch 'master' into for-linus
Conflicts:
	fs/pipe.c
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'fs')
275 files changed, 11458 insertions, 6190 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index ed83583..32ef400 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,7 +40,9 @@ extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; +extern const struct file_operations v9fs_file_operations_dotl; extern const struct file_operations v9fs_dir_operations; +extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 0adfd64..d61e3b2 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = { .open = v9fs_file_open, .release = v9fs_dir_release, }; + +const struct file_operations v9fs_dir_operations_dotl = { + .read = generic_read_dir, + .llseek = generic_file_llseek, + .readdir = v9fs_dir_readdir, + .open = v9fs_file_open, + .release = v9fs_dir_release, +}; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index df52d48..2bedc6c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data, return total; } -static int v9fs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int v9fs_file_fsync(struct file *filp, int datasync) { struct p9_fid *fid; struct p9_wstat wstat; int retval; - P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp, - dentry, datasync); + P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); @@ -296,3 +294,14 @@ const struct file_operations v9fs_file_operations = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync, }; + +const struct file_operations v9fs_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_file_read, + .write = v9fs_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = generic_file_readonly_mmap, + .fsync = v9fs_file_fsync, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 6d4d861..4331b3b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -44,9 +44,12 @@ #include "cache.h" static const struct inode_operations v9fs_dir_inode_operations; -static const struct inode_operations v9fs_dir_inode_operations_ext; +static const struct inode_operations v9fs_dir_inode_operations_dotu; +static const struct inode_operations v9fs_dir_inode_operations_dotl; static const struct inode_operations v9fs_file_inode_operations; +static const struct inode_operations v9fs_file_inode_operations_dotl; static const struct inode_operations v9fs_symlink_inode_operations; +static const struct inode_operations v9fs_symlink_inode_operations_dotl; /** * unixmode2p9mode - convert unix mode bits to plan 9 @@ -273,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) init_special_inode(inode, inode->i_mode, inode->i_rdev); break; case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } + break; + case S_IFLNK: - if (!v9fs_proto_dotu(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); + 
if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " + "legacy protocol.\n"); err = -EINVAL; goto error; } - inode->i_op = &v9fs_symlink_inode_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_symlink_inode_operations_dotl; + else + inode->i_op = &v9fs_symlink_inode_operations; + break; case S_IFDIR: inc_nlink(inode); - if (v9fs_proto_dotu(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; + if (v9fs_proto_dotl(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotl; + else if (v9fs_proto_dotu(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_dotu; else inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; + + if (v9fs_proto_dotl(v9ses)) + inode->i_fop = &v9fs_dir_operations_dotl; + else + inode->i_fop = &v9fs_dir_operations; + break; default: P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", @@ -432,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) { int retval; struct inode *file_inode; - struct v9fs_session_info *v9ses; struct p9_fid *v9fid; P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, rmdir); file_inode = file->d_inode; - v9ses = v9fs_inode2v9ses(file_inode); v9fid = v9fs_fid_clone(file); if (IS_ERR(v9fid)) return PTR_ERR(v9fid); @@ -482,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = NULL; fid = NULL; name = (char *) dentry->d_name.name; - dfid = v9fs_fid_clone(dentry->d_parent); + dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); - P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); - dfid = NULL; - goto error; + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return ERR_PTR(err); } /* clone a fid to use for creation */ @@ -495,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); - ofid = NULL; - goto error; + return ERR_PTR(err); } err = p9_client_fcreate(ofid, name, perm, mode, extension); @@ -506,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* now walk from the parent so we can get unopened fid */ - fid = p9_client_walk(dfid, 1, &name, 0); + fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; - } else - dfid = NULL; + } /* instantiate inode and assign the unopened fid to the dentry */ inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); @@ -536,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, return ofid; error: - if (dfid) - p9_client_clunk(dfid); - if (ofid) p9_client_clunk(ofid); @@ -673,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(fid)) { result = PTR_ERR(fid); if (result == -ENOENT) { - d_add(dentry, NULL); - return NULL; + inode = NULL; + goto inst_out; } return ERR_PTR(result); @@ -691,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (result < 0) goto error; - if ((fid->qid.version) && (v9ses->cache)) +inst_out: + if (v9ses->cache) dentry->d_op = &v9fs_cached_dentry_operations; else dentry->d_op = &v9fs_dentry_operations; @@ -770,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto clunk_olddir; } + if (v9fs_proto_dotl(v9ses)) { + retval = p9_client_rename(oldfid, newdirfid, + (char 
*) new_dentry->d_name.name); + if (retval != -ENOSYS) + goto clunk_newdir; + } + /* 9P can only handle file rename in the same directory */ if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { P9_DPRINTK(P9_DEBUG_ERROR, @@ -1195,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISFIFO(mode)) *name = 0; + else if (S_ISSOCK(mode)) + *name = 0; else { __putname(name); return -EINVAL; @@ -1206,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } -static const struct inode_operations v9fs_dir_inode_operations_ext = { +static const struct inode_operations v9fs_dir_inode_operations_dotu = { + .create = v9fs_vfs_create, + .lookup = v9fs_vfs_lookup, + .symlink = v9fs_vfs_symlink, + .link = v9fs_vfs_link, + .unlink = v9fs_vfs_unlink, + .mkdir = v9fs_vfs_mkdir, + .rmdir = v9fs_vfs_rmdir, + .mknod = v9fs_vfs_mknod, + .rename = v9fs_vfs_rename, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + +static const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, .symlink = v9fs_vfs_symlink, @@ -1237,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = { .setattr = v9fs_vfs_setattr, }; +static const struct inode_operations v9fs_file_inode_operations_dotl = { + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; + static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, @@ -1244,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = { .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; + +static const struct inode_operations v9fs_symlink_inode_operations_dotl = { + .readlink = generic_readlink, + .follow_link = v9fs_vfs_follow_link, + .put_link = v9fs_vfs_put_link, + .getattr = v9fs_vfs_getattr, + .setattr = v9fs_vfs_setattr, +}; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 806da5d..be74d02 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -38,6 +38,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/statfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -45,7 +46,7 @@ #include "v9fs_vfs.h" #include "fid.h" -static const struct super_operations v9fs_super_ops; +static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; /** * v9fs_set_super - set the superblock @@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; - sb->s_op = &v9fs_super_ops; + if (v9fs_proto_dotl(v9ses)) + sb->s_op = &v9fs_super_ops_dotl; + else + sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | @@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb) v9fs_session_begin_cancel(v9ses); } +static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_rstatfs rs; + int res; + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) { + res = PTR_ERR(fid); + goto done; + } + + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9fs_proto_dotl(v9ses)) { + res = p9_client_statfs(fid, &rs); + if (res == 0) { + buf->f_type = rs.type; + buf->f_bsize = rs.bsize; 
+ buf->f_blocks = rs.blocks; + buf->f_bfree = rs.bfree; + buf->f_bavail = rs.bavail; + buf->f_files = rs.files; + buf->f_ffree = rs.ffree; + buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = rs.namelen; + } + if (res != -ENOSYS) + goto done; + } + res = simple_statfs(dentry, buf); +done: + return res; +} + static const struct super_operations v9fs_super_ops = { #ifdef CONFIG_9P_FSCACHE .alloc_inode = v9fs_alloc_inode, @@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = { .umount_begin = v9fs_umount_begin, }; +static const struct super_operations v9fs_super_ops_dotl = { +#ifdef CONFIG_9P_FSCACHE + .alloc_inode = v9fs_alloc_inode, + .destroy_inode = v9fs_destroy_inode, +#endif + .statfs = v9fs_statfs, + .clear_inode = v9fs_clear_inode, + .show_options = generic_show_options, + .umount_begin = v9fs_umount_begin, +}; + struct file_system_type v9fs_fs_type = { .name = "9p", .get_sb = v9fs_get_sb, diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 23aa52f..f4287e4 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = adfs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static int diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 005ea34..a36da53 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 0f5e309..6f850b0 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (error) goto out; + /* XXX: this is missing some actual on-disk truncation.. */ if (ia_valid & ATTR_SIZE) - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); if (error) goto out; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 861dae6..f05b615 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); -int affs_file_fsync(struct file *, struct dentry *, int); +int affs_file_fsync(struct file *, int); /* dir.c */ diff --git a/fs/affs/file.c b/fs/affs/file.c index 184e55c..322710c 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -916,9 +916,9 @@ affs_truncate(struct inode *inode) affs_free_prealloc(inode); } -int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int affs_file_fsync(struct file *filp, int datasync) { - struct inode * inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int ret, err; ret = write_inode_now(inode, 0); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d70bbba..914d1c0 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) affs_brelse(bh); inode = affs_iget(sb, ino); if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); + return ERR_CAST(inode); } dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? 
&affs_intl_dentry_operations : &affs_dentry_operations; d_add(dentry, inode); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 807f284..5f679b77 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern int afs_writeback_all(struct afs_vnode *); -extern int afs_fsync(struct file *, struct dentry *, int); +extern int afs_fsync(struct file *, int); /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index 3bed54a..3dab9e9 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode) * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. */ -int afs_fsync(struct file *file, struct dentry *dentry, int datasync) +int afs_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); int ret; @@ -36,6 +36,7 @@ #include <linux/blkdev.h> #include <linux/mempool.h> #include <linux/hash.h> +#include <linux/compat.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data) /* Complete the fput(s) */ if (req->ki_filp != NULL) - __fput(req->ki_filp); + fput(req->ki_filp); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* * Try to optimize the aio and eventfd file* puts, by avoiding to - * schedule work in case it is not __fput() time. In normal cases, + * schedule work in case it is not final fput() time. In normal cases, * we would not be holding the last reference to the file*, so * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(!fput_atomic(req->ki_filp))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb) return ret; } -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) +static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) { ssize_t ret; - ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, - &kiocb->ki_inline_vec, &kiocb->ki_iovec); +#ifdef CONFIG_COMPAT + if (compat) + ret = compat_rw_copy_check_uvector(type, + (struct compat_iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); + else +#endif + ret = rw_copy_check_uvector(type, + (struct iovec __user *)kiocb->ki_buf, + kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + &kiocb->ki_iovec); if (ret < 0) goto out; @@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb) * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. 
*/ -static ssize_t aio_setup_iocb(struct kiocb *kiocb) +static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) { struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) ret = security_file_permission(file, MAY_READ); if (unlikely(ret)) break; - ret = aio_setup_vectored_rw(READ, kiocb); + ret = aio_setup_vectored_rw(READ, kiocb, compat); if (ret) break; ret = -EINVAL; @@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb) ret = security_file_permission(file, MAY_WRITE); if (unlikely(ret)) break; - ret = aio_setup_vectored_rw(WRITE, kiocb); + ret = aio_setup_vectored_rw(WRITE, kiocb, compat); if (ret) break; ret = -EINVAL; @@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash) } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct hlist_head *batch_hash) + struct iocb *iocb, struct hlist_head *batch_hash, + bool compat) { struct kiocb *req; struct file *file; @@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req); + ret = aio_setup_iocb(req, compat); if (ret) goto out_put_req; @@ -1637,20 +1648,8 @@ out_put_req: return ret; } -/* sys_io_submit: - * Queue the nr iocbs pointed to by iocbpp for processing. Returns - * the number of iocbs queued. May return -EINVAL if the aio_context - * specified by ctx_id is invalid, if nr is < 0, if the iocb at - * *iocbpp[0] is not properly initialized, if the operation specified - * is invalid for the file descriptor in the iocb. May fail with - * -EFAULT if any of the data structures point to invalid data. May - * fail with -EBADF if the file descriptor specified in the first - * iocb is invalid. May fail with -EAGAIN if insufficient resources - * are available to queue any iocbs. Will return 0 if nr is 0. Will - * fail with -ENOSYS if not implemented. - */ -SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, - struct iocb __user * __user *, iocbpp) +long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); + ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); if (ret) break; } @@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return i ? i : ret; } +/* sys_io_submit: + * Queue the nr iocbs pointed to by iocbpp for processing. Returns + * the number of iocbs queued. May return -EINVAL if the aio_context + * specified by ctx_id is invalid, if nr is < 0, if the iocb at + * *iocbpp[0] is not properly initialized, if the operation specified + * is invalid for the file descriptor in the iocb. May fail with + * -EFAULT if any of the data structures point to invalid data. May + * fail with -EBADF if the file descriptor specified in the first + * iocb is invalid. May fail with -EAGAIN if insufficient resources + * are available to queue any iocbs. Will return 0 if nr is 0. Will + * fail with -ENOSYS if not implemented. + */ +SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, + struct iocb __user * __user *, iocbpp) +{ + return do_io_submit(ctx_id, nr, iocbpp, 0); +} + /* lookup_kiocb * Finds a given iocb for cancellation. 
*/ diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 9bd4b38..e4b75d6 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void) * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; @@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok); * @offset: the new size to assign to the inode * @Returns: 0 on success, -ve errno on failure * + * inode_newsize_ok must be called with i_mutex held. + * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). - * - * inode_newsize_ok must be called with i_mutex held. */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { @@ -104,17 +104,25 @@ out_big: } EXPORT_SYMBOL(inode_newsize_ok); -int inode_setattr(struct inode * inode, struct iattr * attr) +/** + * generic_setattr - copy simple metadata updates into the generic inode + * @inode: the inode to be updated + * @attr: the new attributes + * + * generic_setattr must be called with i_mutex held. + * + * generic_setattr updates the inode's metadata with that specified + * in attr. Noticably missing is inode size update, which is more complex + * as it requires pagecache updates. See simple_setsize. + * + * The inode is not marked as dirty after this operation. The rationale is + * that for "simple" filesystems, the struct inode is the inode storage. + * The caller is free to mark the inode dirty afterwards if needed. + */ +void generic_setattr(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_SIZE && - attr->ia_size != i_size_read(inode)) { - int error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - if (ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) @@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr) mode &= ~S_ISGID; inode->i_mode = mode; } +} +EXPORT_SYMBOL(generic_setattr); + +/* + * note this function is deprecated, the new truncate sequence should be + * used instead -- see eg. simple_setsize, generic_setattr. 
+ */ +int inode_setattr(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error; + + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); return 0; diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 8713c7c..9a0520b 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int); static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); const struct file_operations autofs_root_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = autofs_root_readdir, .ioctl = autofs_root_ioctl, diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d29b7f6..ba4a38b 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) */ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { - struct autofs_dev_ioctl tmp, *ads; + struct autofs_dev_ioctl tmp; if (copy_from_user(&tmp, in, sizeof(tmp))) return ERR_PTR(-EFAULT); @@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size < sizeof(tmp)) return ERR_PTR(-EINVAL); - ads = kmalloc(tmp.size, GFP_KERNEL); - if (!ads) - return ERR_PTR(-ENOMEM); - - if (copy_from_user(ads, in, tmp.size)) { - kfree(ads); - return ERR_PTR(-EFAULT); - } - - return ads; + return memdup_user(in, tmp.size); } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) @@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = { }; static struct miscdevice _autofs_dev_ioctl_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops }; +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + /* Register/deregister misc character device */ int autofs_dev_ioctl_init(void) { diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e8e5e63..db4117e 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -18,13 +18,14 @@ #include <linux/slab.h> #include <linux/param.h> #include <linux/time.h> +#include <linux/smp_lock.h> #include "autofs_i.h" static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); -static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); +static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); @@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = { .read = generic_read_dir, .readdir = dcache_readdir, .llseek = dcache_dir_lseek, - .ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs4_root_ioctl, }; const struct file_operations autofs4_dir_operations = { @@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int 
autofs4_root_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) { struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); void __user *p = (void __user *)arg; @@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, return -ENOSYS; } } + +static long autofs4_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + long ret; + struct inode *inode = filp->f_dentry->d_inode; + + lock_kernel(); + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + unlock_kernel(); + + return ret; +} diff --git a/fs/bad_inode.c b/fs/bad_inode.c index a05287a..52e59bf 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp) return -EIO; } -static int bad_file_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int bad_file_fsync(struct file *file, int datasync) { return -EIO; } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 8f73841..d967e05 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/block_dev.c b/fs/block_dev.c index 26e5f50..7346c96 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), - iov, offset, nr_segs, blkdev_get_blocks, NULL); + return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode, + I_BDEV(inode), iov, offset, nr_segs, + blkdev_get_blocks, NULL); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - blkdev_get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, @@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) return retval; } -/* - * Filp is never NULL; the only case when ->fsync() is called with - * NULL first argument is nfsd_sync_dir() and that's not a directory. - */ - -int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) +int blkdev_fsync(struct file *filp, int datasync) { struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 462859a3..7ec1409 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -377,6 +377,7 @@ again: if (!list_empty(&worker->pending) || !list_empty(&worker->prio_pending)) { spin_unlock_irq(&worker->lock); + set_current_state(TASK_RUNNING); goto again; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7a4dee1..6ad63f1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -137,8 +137,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. 
*/ spinlock_t accounting_lock; + atomic_t outstanding_extents; int reserved_extents; - int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used @@ -151,6 +151,7 @@ struct btrfs_inode { * of these. */ unsigned ordered_data_close:1; + unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; /* diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6795a71..0d1d966 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow) + struct extent_buffer *cow, + int *last_ref) { u64 refs; u64 owner; @@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); } clean_tree_block(trans, root, buf); + *last_ref = 1; } return 0; } @@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level; + int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow); + update_ref_for_cow(trans, root, buf, cow, &last_ref); + + if (root->ref_cows) + btrfs_reloc_cow_block(trans, root, buf, cow); if (buf == root->node) { WARN_ON(parent && parent != buf); @@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, return bin_search(eb, key, level, slot); } +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. 
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } spin_lock(&root->node_lock); root->node = child; @@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer(mid); - return ret; + return 0; } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) @@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - u64 bytenr = right->start; - u32 blocksize = right->len; - clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - free_extent_buffer(right); - right = NULL; wret = del_ptr(trans, root, path, level + 1, pslot + 1); if (wret) ret = wret; - wret = btrfs_free_tree_block(trans, root, - bytenr, blocksize, 0, - root->root_key.objectid, - level); - if (wret) - ret = wret; + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer(right); + right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); @@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - /* we've managed to empty the middle node, drop it */ - u64 bytenr = mid->start; - u32 blocksize = mid->len; - clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - free_extent_buffer(mid); - mid = NULL; wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, level); - if (wret) - ret = wret; + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer(mid); + mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; @@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_release_path(NULL, p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, gen); + tmp = read_tree_block(root, blocknr, blocksize, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, @@ -1740,7 +1754,6 @@ again: p->nodes[level + 1], p->slots[level + 1], &b); if (err) { - free_extent_buffer(b); ret = err; goto done; } @@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (IS_ERR(c)) return PTR_ERR(c); + root_add_used(root, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); @@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root int nritems; BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, if (IS_ERR(split)) return PTR_ERR(split); + 
root_add_used(root, root->nodesize); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); @@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root, left); + btrfs_mark_buffer_dirty(right); btrfs_item_key(right, &disk_key, 0); @@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); if (right_nritems) btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); @@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = left; @@ -2932,10 +2953,10 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, &disk_key, 0, l->start, 0); - if (IS_ERR(right)) { - BUG_ON(1); + if (IS_ERR(right)) return PTR_ERR(right); - } + + root_add_used(root, root->leafsize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); - BUG_ON(ret); + if (ret) + goto err; path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); @@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0); - return ret; + root_sub_used(root, leaf->len); + + btrfs_free_tree_block(trans, root, leaf, 0, 1); + return 0; } /* * delete the item at the leaf level in path. 
If that empties @@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { + btrfs_set_path_blocking(path); + clean_tree_block(trans, root, leaf); ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746a724..29c2009 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; +struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; @@ -663,6 +664,7 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 struct btrfs_block_group_item { __le64 used; @@ -674,42 +676,46 @@ struct btrfs_space_info { u64 flags; u64 total_bytes; /* total bytes in the space */ - u64 bytes_used; /* total bytes used on disk */ + u64 bytes_used; /* total bytes used, + this does't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_super; /* total bytes reserved for the super blocks */ - u64 bytes_root; /* the number of bytes needed to commit a - transaction */ + u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ - u64 bytes_delalloc; /* number of bytes currently reserved for - delayed allocation */ + u64 disk_used; /* total bytes used on disk */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ - int force_delalloc; /* make people start doing filemap_flush until - we're under a threshold */ struct list_head list; - /* for controlling how we free up space for allocations */ - wait_queue_head_t allocate_wait; - wait_queue_head_t flush_wait; - int allocating_chunk; - int flushing; - /* for block groups in our same type */ - struct list_head block_groups; + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; }; +struct btrfs_block_rsv { + u64 size; + u64 reserved; + u64 freed[2]; + struct btrfs_space_info *space_info; + struct list_head list; + spinlock_t lock; + atomic_t usage; + unsigned int priority:8; + unsigned int durable:1; + unsigned int refill_used:1; + unsigned int full:1; +}; + /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. 
They are used for all metadata @@ -760,6 +766,7 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -825,6 +832,22 @@ struct btrfs_fs_info { /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; + /* block reservation for extent, checksum and root tree */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + /* list of block reservations that cross multiple transactions */ + struct list_head durable_block_rsv_list; + + struct mutex durable_block_rsv_mutex; + u64 generation; u64 last_trans_committed; @@ -927,7 +950,6 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; - struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -943,6 +965,7 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; + int enospc_unlink; u64 total_pinned; @@ -1012,6 +1035,9 @@ struct btrfs_root { struct completion kobj_unregister; struct mutex objectid_mutex; + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; @@ -1043,7 +1069,6 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; - int clean_orphans; u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -1057,8 +1082,11 @@ struct btrfs_root { struct list_head root_list; - spinlock_t list_lock; + spinlock_t orphan_lock; struct list_head orphan_list; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_item_inserted; + int orphan_cleanup_state; spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ @@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, @@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 
group_start); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group); - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes); -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries); +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); +int 
btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, @@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_sync_file(struct file *file, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); @@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct 
btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 902ce50..e807b14 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -319,107 +319,6 @@ out: } /* - * helper function to lookup reference count and flags of extent. - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. the head - * node may also store the extent flags to set. This way you can check - * to see what the reference count and extent flags would be if all of - * the delayed refs are not processed. - */ -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - struct btrfs_key key; - u32 item_size; - u64 num_refs; - u64 extent_flags; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - delayed_refs = &trans->transaction->delayed_refs; -again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - num_refs = btrfs_extent_refs_v0(leaf, ei0); - /* FIXME: this isn't correct for data */ - extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; -#else - BUG(); -#endif - } - BUG_ON(num_refs == 0); - } else { - num_refs = 0; - extent_flags = 0; - ret = 0; - } - - spin_lock(&delayed_refs->lock); - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); - if (ref) { - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } - if (head->extent_op && head->extent_op->update_flags) - extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); - - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - } - WARN_ON(num_refs == 0); - if (refs) - *refs = num_refs; - if (flags) - *flags = extent_flags; -out: - spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); - return ret; -} - -/* * helper function to update an extent delayed ref in the * rbtree. 
existing and update must both have the same * bytenr and parent diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f6fc67d..50e3cf9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index feca041..f3b287c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -74,6 +74,11 @@ struct async_submit_bio { int rw; int mirror_num; unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; struct btrfs_work work; }; @@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); fs_info = BTRFS_I(async->inode)->root->fs_info; async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_done(struct btrfs_work *work) @@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work) int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done) { @@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->work.flags = 0; async->bio_flags = bio_flags; + async->bio_offset = bio_offset; atomic_inc(&fs_info->nr_async_submits); @@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio) static int __btree_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { int ret; @@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, */ return 
btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, 0, + bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->ref_cows = 0; root->track_dirty = 0; root->in_radix = 0; - root->clean_orphans = 0; + root->orphan_item_inserted = 0; + root->orphan_cleanup_state = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->name = NULL; root->in_sysfs = 0; root->inode_tree = RB_ROOT; + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->node_lock); - spin_lock_init(&root->list_lock); + spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->accounting_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct extent_buffer *eb; - struct btrfs_root *log_root_tree = fs_info->log_root_tree; - u64 start = 0; - u64 end = 0; - int ret; - - if (!log_root_tree) - return 0; - - while (1) { - ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); - if (ret) - break; - - clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); - } - eb = fs_info->log_root_tree->node; - - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) != 0); - - ret = btrfs_free_reserved_extent(fs_info->tree_root, - eb->start, eb->len); - BUG_ON(ret); - - free_extent_buffer(eb); - kfree(fs_info->log_root_tree); - fs_info->log_root_tree = NULL; - return 0; -} - static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1191,19 +1172,23 @@ again: if (root) return root; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); - if (ret == 0) - ret = -ENOENT; - if (ret < 0) - return ERR_PTR(ret); - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); if (IS_ERR(root)) return root; - WARN_ON(btrfs_root_refs(&root->root_item) == 0); set_anon_super(&root->anon_super, NULL); + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret < 0) + goto fail; + if (ret == 0) + root->orphan_item_inserted = 1; + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto fail; @@ -1212,10 +1197,9 @@ again: ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) { + if (ret == 0) root->in_radix = 1; - root->clean_orphans = 1; - } + spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); if (ret) { @@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - smp_mb(); - if (root->fs_info->closing) - break; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); if (!(root->fs_info->sb->s_flags & MS_RDONLY) && @@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg) if (freezing(current)) { refrigerator(); } else { - smp_mb(); - if (root->fs_info->closing) - break; 
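In both the cleaner and the transaction kthread the explicit fs_info->closing checks are dropped: the loops now rely on kthread_should_stop(), only schedule when no stop has been requested, and close_ctree() (later in this patch) stops the threads after the final commit. The following is a minimal user-space sketch of that shutdown pattern, using a pthread worker and a plain flag in place of the kthread API; names such as worker_should_stop() are invented for the example and are not kernel interfaces.

/* Minimal user-space sketch of the "check the stop flag before sleeping"
 * pattern the btrfs cleaner/transaction kthreads follow after this patch.
 * worker_should_stop() stands in for kthread_should_stop(); everything here
 * is illustrative, not kernel code.  Build with: gcc -pthread */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool stop_requested;

static bool worker_should_stop(void)
{
	pthread_mutex_lock(&lock);
	bool stop = stop_requested;
	pthread_mutex_unlock(&lock);
	return stop;
}

static void *cleaner_loop(void *arg)
{
	(void)arg;
	do {
		/* ... one round of cleanup work would go here ... */
		pthread_mutex_lock(&lock);
		if (!stop_requested)          /* only sleep if no stop is pending */
			pthread_cond_wait(&wake, &lock);
		pthread_mutex_unlock(&lock);
	} while (!worker_should_stop());
	return NULL;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, cleaner_loop, NULL);
	sleep(1);                             /* let the worker run a little */
	pthread_mutex_lock(&lock);
	stop_requested = true;                /* plays the role of kthread_stop() */
	pthread_cond_broadcast(&wake);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	puts("worker stopped cleanly");
	return 0;
}

The broadcast wakes a sleeping worker, and a worker that has not slept yet sees the flag before it waits, so shutdown cannot be missed in either ordering.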
set_current_state(TASK_INTERRUPTIBLE); - schedule(); + if (!kthread_should_stop()) + schedule(); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; + u64 transid; unsigned long now; unsigned long delay; int ret; do { - smp_mb(); - if (root->fs_info->closing) - break; - delay = HZ * 30; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->new_trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->new_trans_lock); goto sleep; } now = get_seconds(); - if (now < cur->start_time || now - cur->start_time < 30) { - mutex_unlock(&root->fs_info->trans_mutex); + if (!cur->blocked && + (now < cur->start_time || now - cur->start_time < 30)) { + spin_unlock(&root->fs_info->new_trans_lock); delay = HZ * 5; goto sleep; } - mutex_unlock(&root->fs_info->trans_mutex); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); + transid = cur->transid; + spin_unlock(&root->fs_info->new_trans_lock); + trans = btrfs_join_transaction(root, 1); + if (transid == trans->transid) { + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } else { + btrfs_end_transaction(trans, root); + } sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1530,10 +1512,10 @@ sleep: if (freezing(current)) { refrigerator(); } else { - if (root->fs_info->closing) - break; set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(delay); + if (!kthread_should_stop() && + !btrfs_transaction_blocked(root->fs_info)) + schedule_timeout(delay); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); + btrfs_init_block_rsv(&fs_info->trans_block_rsv); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv); + btrfs_init_block_rsv(&fs_info->empty_block_rsv); + INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); + mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), &fs_info->generic_worker); - btrfs_init_workers(&fs_info->enospc_workers, "enospc", - fs_info->thread_pool_size, - &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1912,17 +1897,18 @@ struct btrfs_root 
*open_ctree(struct super_block *sb, csum_root->track_dirty = 1; + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } - fs_info->generation = generation; - fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, BUG_ON(ret); if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + BUG_ON(ret); + ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING @@ -2040,7 +2029,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root) down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); @@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); - if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + fs_info->closing = 2; smp_mb(); @@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c958ecb..88e825a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, + unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); @@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 
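The btrfs_wq_submit_bio() prototype change above matches the disk-io.c hunks earlier in this patch: the new bio_offset argument is stored in struct async_submit_bio so the deferred submit_bio_start/submit_bio_done hooks see the same value the original caller passed in. Below is a small sketch of that plumbing pattern; the names (submit_hook_t, struct async_ctx, run_async) are invented for illustration and are not the kernel types.

/* Sketch of threading an extra per-request value (here "offset") through a
 * deferred callback, mirroring how bio_offset is captured in
 * async_submit_bio and replayed to the start/done hooks.  Illustrative only. */
#include <stdint.h>
#include <stdio.h>

typedef int (*submit_hook_t)(int rw, uint64_t offset);

struct async_ctx {
	int rw;
	uint64_t offset;                /* carried along, like async->bio_offset */
	submit_hook_t start;
	submit_hook_t done;
};

static int my_start(int rw, uint64_t offset)
{
	printf("start: rw=%d offset=%llu\n", rw, (unsigned long long)offset);
	return 0;
}

static int my_done(int rw, uint64_t offset)
{
	printf("done:  rw=%d offset=%llu\n", rw, (unsigned long long)offset);
	return 0;
}

/* stands in for the worker running run_one_async_start/_done */
static void run_async(struct async_ctx *ctx)
{
	ctx->start(ctx->rw, ctx->offset);
	ctx->done(ctx->rw, ctx->offset);
}

int main(void)
{
	struct async_ctx ctx = { .rw = 1, .offset = 4096,
				 .start = my_start, .done = my_done };
	run_async(&ctx);
	return 0;
}

The only point being made is that the value captured at submit time is exactly what the asynchronous hooks later receive.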
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c6a4f45..b9080d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,10 +35,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free); -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 bytenr, u64 num_bytes, int alloc); +static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache) void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { - if (atomic_dec_and_test(&cache->count)) + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + WARN_ON(cache->reserved_pinned > 0); kfree(cache); + } } /* @@ -319,7 +316,7 @@ static int caching_kthread(void *data) exclude_super_stripes(extent_root, block_group); spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_super += block_group->bytes_super; + block_group->space_info->bytes_readonly += block_group->bytes_super; spin_unlock(&block_group->space_info->lock); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; + rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags == flags) { @@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) } /* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. 
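The lookup helper reintroduced below in extent-tree.c reports the reference count and flags an extent would have once all queued delayed refs are applied: it reads the committed count from the extent item (zero if no item is found), adds the ref_mod stored in the delayed-ref head, retries if the head's mutex is contended, and can now also run without a transaction by searching the commit root with locking skipped. A much-simplified user-space sketch of just the summation step follows, using a plain array in place of the delayed-ref tree; all names are illustrative.

/* Simplified sketch: the effective reference count is the committed on-disk
 * count plus the pending modifications queued against the same byte number.
 * A flat array stands in for the delayed-ref rbtree and its head node;
 * illustrative only, not the kernel data structures. */
#include <stdint.h>
#include <stdio.h>

struct pending_mod {
	uint64_t bytenr;
	int64_t ref_mod;          /* like delayed_ref_head->node.ref_mod */
};

static uint64_t effective_refs(uint64_t on_disk_refs, uint64_t bytenr,
			       const struct pending_mod *mods, int nr)
{
	int64_t refs = (int64_t)on_disk_refs;
	for (int i = 0; i < nr; i++)
		if (mods[i].bytenr == bytenr)
			refs += mods[i].ref_mod;
	return (uint64_t)refs;
}

int main(void)
{
	struct pending_mod mods[] = {
		{ .bytenr = 8192, .ref_mod = +2 },
		{ .bytenr = 8192, .ref_mod = -1 },
	};
	/* the extent item says 1 ref; pending mods net +1, so 2 */
	printf("refs = %llu\n",
	       (unsigned long long)effective_refs(1, 8192, mods, 2));
	return 0;
}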
+ */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* * Back reference rules. 
Back refs have three main goals: * * 1) differentiate between all holders of references to an extent so that @@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, return ret; } - /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - - ret = pin_down_bytes(trans, root, NULL, - node->bytenr, node->num_bytes, - head->is_data, 1, &must_clean); - if (ret > 0) - mark_free = 1; - - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - if (mark_free) { - ret = btrfs_free_reserved_extent(root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); - } } mutex_unlock(&head->mutex); return 0; @@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, ret = 0; out: btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); return ret; } @@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; + int i; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; found = __find_space_info(info, flags); if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; found->full = 0; spin_unlock(&found->lock); *space_info = found; @@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - INIT_LIST_HEAD(&found->block_groups); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); - init_waitqueue_head(&found->flush_wait); - init_waitqueue_head(&found->allocate_wait); spin_lock_init(&found->lock); - found->flags = flags; + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; - found->bytes_delalloc = 0; + found->bytes_may_use = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } } -static void set_block_group_readonly(struct btrfs_block_group_cache *cache) -{ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (!cache->ro) { - cache->space_info->bytes_readonly += cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->ro = 1; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); -} - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; @@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } -static u64 
btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) -{ - struct btrfs_fs_info *info = root->fs_info; - u64 alloc_profile; - - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } - - return btrfs_reduce_alloc_profile(root, data); -} - -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - u64 alloc_target; - - alloc_target = btrfs_get_alloc_profile(root, 1); - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - alloc_target); -} - -static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) -{ - u64 num_bytes; - int level; - - level = BTRFS_MAX_LEVEL - 2; - /* - * NOTE: these calculations are absolutely the worst possible case. - * This assumes that _every_ item we insert will require a new leaf, and - * that the tree has grown to its maximum level size. - */ - - /* - * for every item we insert we could insert both an extent item and a - * extent ref item. Then for ever item we insert, we will need to cow - * both the original leaf, plus the leaf to the left and right of it. - * - * Unless we are talking about the extent root, then we just want the - * number of items * 2, since we just need the extent item plus its ref. - */ - if (root == root->fs_info->extent_root) - num_bytes = num_items * 2; - else - num_bytes = (num_items + (2 * num_items)) * 3; - - /* - * num_bytes is total number of leaves we could need times the leaf - * size, and then for every leaf we could end up cow'ing 2 nodes per - * level, down to the leaf level. - */ - num_bytes = (num_bytes * root->leafsize) + - (num_bytes * (level * 2)) * root->nodesize; - - return num_bytes; -} - -/* - * Unreserve metadata space for delalloc. If we have less reserved credits than - * we have extents, this function does nothing. 
- */ -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); - - spin_lock(&meta_sinfo->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - if (BTRFS_I(inode)->reserved_extents <= - BTRFS_I(inode)->outstanding_extents) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); - return 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); - - BTRFS_I(inode)->reserved_extents -= num_items; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); - - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; - } else { - meta_sinfo->bytes_delalloc -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 thresh; - - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; - - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; - else - meta_sinfo->force_delalloc = 0; + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; + return btrfs_reduce_alloc_profile(root, flags); } -struct async_flush { - struct btrfs_root *root; - struct btrfs_space_info *info; - struct btrfs_work work; -}; - -static noinline void flush_delalloc_async(struct btrfs_work *work) +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { - struct async_flush *async; - struct btrfs_root *root; - struct btrfs_space_info *info; - - async = container_of(work, struct async_flush, work); - root = async->root; - info = async->info; - - btrfs_start_delalloc_inodes(root, 0); - wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); - - kfree(async); -} - -static void wait_on_flush(struct btrfs_space_info *info) -{ - DEFINE_WAIT(wait); - u64 used; - - while (1) { - prepare_to_wait(&info->flush_wait, &wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&info->lock); - if (!info->flushing) { - spin_unlock(&info->lock); - break; - } - - used = info->bytes_used + info->bytes_reserved + - info->bytes_pinned + info->bytes_readonly + - info->bytes_super + info->bytes_root + - info->bytes_may_use + info->bytes_delalloc; - if (used < info->total_bytes) { - spin_unlock(&info->lock); - break; - } - spin_unlock(&info->lock); - schedule(); - } - finish_wait(&info->flush_wait, &wait); -} - -static void flush_delalloc(struct btrfs_root *root, - struct btrfs_space_info 
*info) -{ - struct async_flush *async; - bool wait = false; - - spin_lock(&info->lock); + u64 flags; - if (!info->flushing) - info->flushing = 1; + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; else - wait = true; - - spin_unlock(&info->lock); - - if (wait) { - wait_on_flush(info); - return; - } - - async = kzalloc(sizeof(*async), GFP_NOFS); - if (!async) - goto flush; - - async->root = root; - async->info = info; - async->work.func = flush_delalloc_async; + flags = BTRFS_BLOCK_GROUP_METADATA; - btrfs_queue_worker(&root->fs_info->enospc_workers, - &async->work); - wait_on_flush(info); - return; - -flush: - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); + return get_alloc_profile(root, flags); } -static int maybe_allocate_chunk(struct btrfs_root *root, - struct btrfs_space_info *info) -{ - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; - struct btrfs_trans_handle *trans; - bool wait = false; - int ret = 0; - u64 min_metadata; - u64 free_space; - - free_space = btrfs_super_total_bytes(disk_super); - /* - * we allow the metadata to grow to a max of either 10gb or 5% of the - * space in the volume. - */ - min_metadata = min((u64)10 * 1024 * 1024 * 1024, - div64_u64(free_space * 5, 100)); - if (info->total_bytes >= min_metadata) { - spin_unlock(&info->lock); - return 0; - } - - if (info->full) { - spin_unlock(&info->lock); - return 0; - } - - if (!info->allocating_chunk) { - info->force_alloc = 1; - info->allocating_chunk = 1; - } else { - wait = true; - } - - spin_unlock(&info->lock); - - if (wait) { - wait_event(info->allocate_wait, - !info->allocating_chunk); - return 1; - } - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 4096 + 2 * 1024 * 1024, - info->flags, 0); - btrfs_end_transaction(trans, root); - if (ret) - goto out; -out: - spin_lock(&info->lock); - info->allocating_chunk = 0; - spin_unlock(&info->lock); - wake_up(&info->allocate_wait); - - if (ret) - return 0; - return 1; -} - -/* - * Reserve metadata space for delalloc. 
- */ -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int flushed = 0; - int force_delalloc; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); -again: - spin_lock(&meta_sinfo->lock); - - force_delalloc = meta_sinfo->force_delalloc; - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - flushed++; - - if (flushed == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - flushed++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (flushed == 2) { - filemap_flush(inode->i_mapping); - goto again; - } else if (flushed == 3) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_delalloc -= num_bytes; - spin_unlock(&meta_sinfo->lock); - printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->outstanding_extents, - BTRFS_I(inode)->reserved_extents); - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } - - BTRFS_I(inode)->reserved_extents += num_items; - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - if (!flushed && force_delalloc) - filemap_flush(inode->i_mapping); - - return 0; -} - -/* - * unreserve num_items number of items worth of metadata space. This needs to - * be paired with btrfs_reserve_metadata_space. - * - * NOTE: if you have the option, run this _AFTER_ you do a - * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref - * oprations which will result in more used metadata, so we want to make sure we - * can do that without issue. - */ -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root, num_items); - - spin_lock(&meta_sinfo->lock); - if (meta_sinfo->bytes_may_use < num_bytes) { - bug = true; - meta_sinfo->bytes_may_use = 0; - } else { - meta_sinfo->bytes_may_use -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -/* - * Reserve some metadata space for use. We'll calculate the worste case number - * of bytes that would be needed to modify num_items number of items. If we - * have space, fantastic, if not, you get -ENOSPC. Please call - * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of - * items you reserved, since whatever metadata you needed should have already - * been allocated. - * - * This will commit the transaction to make more space if we don't have enough - * metadata space. 
THe only time we don't do this is if we're reserving space - * inside of a transaction, then we will just return -ENOSPC and it is the - * callers responsibility to handle it properly. - */ -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int retries = 0; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root, num_items); -again: - spin_lock(&meta_sinfo->lock); - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!retries) - meta_sinfo->bytes_may_use += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - retries++; - if (retries == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - retries++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (retries == 2) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_may_use -= num_bytes; - spin_unlock(&meta_sinfo->lock); - - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } - - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - return 0; + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_DATA); } /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 used; - int ret = 0, committed = 0, flushed = 0; + int ret = 0, committed = 0; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); @@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); - used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + - data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + - data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + - data_sinfo->bytes_super; + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; if (used + bytes > data_sinfo->total_bytes) { struct btrfs_trans_handle *trans; - if (!flushed) { - spin_unlock(&data_sinfo->lock); - flush_delalloc(root, data_sinfo); - flushed = 1; - goto again; - } - /* * if we don't have enough free bytes in this space then we need * to alloc a new chunk. 
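With the delalloc counters gone, btrfs_check_data_free_space() now sums bytes_used, bytes_reserved, bytes_pinned, bytes_readonly and bytes_may_use, and grants the request by bumping bytes_may_use only if that sum plus the new bytes still fits in total_bytes; the surrounding hunk shows the failure path first trying do_chunk_alloc() through a joined transaction and then a transaction commit before returning -ENOSPC. A stripped-down sketch of just the accounting decision, with a plain struct standing in for btrfs_space_info and no locking or fallback paths:

/* Stripped-down sketch of the data space check: grant the request by
 * increasing bytes_may_use only if everything already accounted for plus
 * the new bytes still fits in total_bytes.  No locking, no chunk allocation
 * or transaction-commit fallback; illustrative only. */
#include <stdint.h>
#include <stdio.h>

struct space_info {
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t bytes_reserved;
	uint64_t bytes_pinned;
	uint64_t bytes_readonly;
	uint64_t bytes_may_use;
};

static int check_data_free_space(struct space_info *s, uint64_t bytes,
				 uint64_t sectorsize)
{
	/* round the request up to whole sectors, as the kernel code does */
	bytes = (bytes + sectorsize - 1) & ~(sectorsize - 1);

	uint64_t used = s->bytes_used + s->bytes_reserved + s->bytes_pinned +
			s->bytes_readonly + s->bytes_may_use;
	if (used + bytes > s->total_bytes)
		return -1;            /* would be -ENOSPC in the kernel */
	s->bytes_may_use += bytes;
	return 0;
}

int main(void)
{
	struct space_info s = { .total_bytes = 1 << 20, .bytes_used = 900 * 1024 };
	printf("reserve 64K:  %d\n", check_data_free_space(&s, 64 * 1024, 4096));
	printf("reserve 256K: %d\n", check_data_free_space(&s, 256 * 1024, 4096));
	return 0;
}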
@@ -3274,15 +2913,15 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret) + if (ret < 0) return ret; if (!data_sinfo) { @@ -3297,25 +2936,26 @@ alloc: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); if (ret) return ret; goto again; } - printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" - ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use " - "%llu total\n", (unsigned long long)bytes, - (unsigned long long)data_sinfo->bytes_delalloc, +#if 0 /* I hope we never need this code again, just in case */ + printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " + "%llu bytes_reserved, " "%llu bytes_pinned, " + "%llu bytes_readonly, %llu may use %llu total\n", + (unsigned long long)bytes, (unsigned long long)data_sinfo->bytes_used, (unsigned long long)data_sinfo->bytes_reserved, (unsigned long long)data_sinfo->bytes_pinned, (unsigned long long)data_sinfo->bytes_readonly, (unsigned long long)data_sinfo->bytes_may_use, (unsigned long long)data_sinfo->total_bytes); +#endif return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3326,12 +2966,13 @@ alloc: } /* - * if there was an error for whatever reason after calling - * btrfs_check_data_free_space, call this so we can cleanup the counters. + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space */ -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ @@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root, spin_unlock(&data_sinfo->lock); } -/* called when we are adding a delalloc extent to the inode's io_tree */ -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *data_sinfo; - - /* get the space info for where this inode will be storing its data */ - data_sinfo = BTRFS_I(inode)->space_info; - - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_delalloc += bytes; - - /* - * we are adding a delalloc extent without calling - * btrfs_check_data_free_space first. 
This happens on a weird - * writepage condition, but shouldn't hurt our accounting - */ - if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { - data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; - BTRFS_I(inode)->reserved_bytes = 0; - } else { - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - } - - spin_unlock(&data_sinfo->lock); -} - -/* called when we are clearing an delalloc extent from the inode's io_tree */ -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *info; - - info = BTRFS_I(inode)->space_info; - - spin_lock(&info->lock); - info->bytes_delalloc -= bytes; - spin_unlock(&info->lock); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static int should_alloc_chunk(struct btrfs_space_info *sinfo, + u64 alloc_bytes) +{ + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + + return 1; +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; - u64 thresh; int ret = 0; mutex_lock(&fs_info->chunk_mutex); @@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 8); - if (!force && - (space_info->bytes_used + space_info->bytes_pinned + - space_info->bytes_reserved + alloc_bytes) < thresh) { + if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, spin_lock(&space_info->lock); if (ret) space_info->full = 1; + else + ret = 1; space_info->force_alloc = 0; spin_unlock(&space_info->lock); out: @@ -3461,13 +3073,713 @@ out: return ret; } +static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 num_bytes) +{ + int ret; + int end_trans = 0; + + if (sinfo->full) + return 0; + + spin_lock(&sinfo->lock); + ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); + spin_unlock(&sinfo->lock); + if (!ret) + return 0; + + if (!trans) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + end_trans = 1; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, + get_alloc_profile(root, sinfo->flags), 0); + + if (end_trans) + btrfs_end_transaction(trans, root); + + return ret == 1 ? 
1 : 0; +} + +/* + * shrink metadata reservation for delalloc + */ +static int shrink_delalloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 to_reclaim) +{ + struct btrfs_block_rsv *block_rsv; + u64 reserved; + u64 max_reclaim; + u64 reclaimed = 0; + int pause = 1; + int ret; + + block_rsv = &root->fs_info->delalloc_block_rsv; + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0) + return 0; + + max_reclaim = min(reserved, to_reclaim); + + while (1) { + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + if (!ret) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pause); + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + } else { + pause = 1; + } + + spin_lock(&block_rsv->lock); + if (reserved > block_rsv->reserved) + reclaimed = reserved - block_rsv->reserved; + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0 || reclaimed >= max_reclaim) + break; + + if (trans && trans->transaction->blocked) + return -EAGAIN; + } + return reclaimed >= to_reclaim; +} + +static int should_retry_reserve(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + int ret; + + if ((*retries) > 2) + return -ENOSPC; + + ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); + if (ret) + return 1; + + if (trans && trans->transaction->in_commit) + return -ENOSPC; + + ret = shrink_delalloc(trans, root, num_bytes); + if (ret) + return ret; + + spin_lock(&space_info->lock); + if (space_info->bytes_pinned < num_bytes) + ret = 1; + spin_unlock(&space_info->lock); + if (ret) + return -ENOSPC; + + (*retries)++; + + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + + return 1; +} + +static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 unused; + int ret = -ENOSPC; + + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + if (unused < space_info->total_bytes) + unused = space_info->total_bytes - unused; + else + unused = 0; + + if (unused >= num_bytes) { + if (block_rsv->priority >= 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } else { + if ((unused + block_rsv->reserved) * + block_rsv->priority >= + (num_bytes + block_rsv->reserved) * 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } + } + } + spin_unlock(&space_info->lock); + + return ret; +} + +static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + if (root->ref_cows) + block_rsv = trans->block_rsv; + else + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; +} + +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 
+ u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + block_rsv_add_bytes(dest, num_bytes, 0); + } else { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= num_bytes; + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + block_rsv_add_bytes(dst, num_bytes, 1); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + atomic_set(&rsv->usage, 1); + rsv->priority = 6; + INIT_LIST_HEAD(&rsv->list); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 alloc_target; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_block_rsv(block_rsv); + + alloc_target = btrfs_get_alloc_profile(root, 0); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + if (rsv && atomic_dec_and_test(&rsv->usage)) { + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (!rsv->durable) + kfree(rsv); + } +} + +/* + * make the block_rsv struct be able to capture freed space. 
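The reserve objects introduced above carry two counters: size, the amount the owner wants backed, and reserved, the amount actually carved out of the space_info; full is set once reserved has caught up with size, and releasing a reserve hands any surplus either to a destination reserve or back to the space_info. The following user-space sketch mirrors that size/reserved/full bookkeeping without the spinlocks, priorities or space_info coupling; it is illustrative only.

/* User-space sketch of the block reserve bookkeeping: "size" is the target,
 * "reserved" is what is actually backed, "full" means reserved has caught
 * up with size.  Locks and the space_info interaction are omitted; the
 * kernel passes (u64)-1 to mean "release everything", simplified here. */
#include <stdint.h>
#include <stdio.h>

struct block_rsv {
	uint64_t size;
	uint64_t reserved;
	int full;
};

static int rsv_use_bytes(struct block_rsv *r, uint64_t n)
{
	if (r->reserved < n)
		return -1;                 /* -ENOSPC in the kernel */
	r->reserved -= n;
	if (r->reserved < r->size)
		r->full = 0;
	return 0;
}

static void rsv_add_bytes(struct block_rsv *r, uint64_t n, int update_size)
{
	r->reserved += n;
	if (update_size)
		r->size += n;
	else if (r->reserved >= r->size)
		r->full = 1;
}

/* returns the surplus the caller should hand to another reserve or back
 * to the space_info */
static uint64_t rsv_release_bytes(struct block_rsv *r, uint64_t n)
{
	if (n > r->size)
		n = r->size;
	r->size -= n;
	if (r->reserved >= r->size) {
		uint64_t excess = r->reserved - r->size;
		r->reserved = r->size;
		r->full = 1;
		return excess;
	}
	return 0;
}

int main(void)
{
	struct block_rsv r = { 0 };
	rsv_add_bytes(&r, 4096, 1);           /* reserve and grow the target */
	rsv_use_bytes(&r, 1024);              /* consume part of it */
	uint64_t back = rsv_release_bytes(&r, 4096);   /* drop the target */
	printf("size=%llu reserved=%llu returned=%llu\n",
	       (unsigned long long)r.size, (unsigned long long)r.reserved,
	       (unsigned long long)back);
	return 0;
}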
+ * the captured space will re-add to the the block_rsv struct + * after transaction commit + */ +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv) +{ + block_rsv->durable = 1; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); + mutex_unlock(&fs_info->durable_block_rsv_mutex); +} + +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) +{ + int ret; + + if (num_bytes == 0) + return 0; +again: + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); + if (ret > 0) + goto again; + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor) +{ + u64 num_bytes = 0; + int commit_trans = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + if (min_factor > 0) + num_bytes = div_factor(block_rsv->size, min_factor); + if (min_reserved > num_bytes) + num_bytes = min_reserved; + + if (block_rsv->reserved >= num_bytes) { + ret = 0; + } else { + num_bytes -= block_rsv->reserved; + if (block_rsv->durable && + block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) + commit_trans = 1; + } + spin_unlock(&block_rsv->lock); + if (!ret) + return 0; + + if (block_rsv->refill_used) { + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; + } + } + + if (commit_trans) { + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + return 0; + } + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return -ENOSPC; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv->full || global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); +} + +/* + * helper to calculate size of global block reservation. + * the desired value is sum of space used by extent tree, + * checksum tree and root tree + */ +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(&fs_info->super_copy); +#if 0 + /* + * per tree used space accounting can be inaccuracy, so we + * can't rely on it. 
+ */ + spin_lock(&fs_info->extent_root->accounting_lock); + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); + spin_unlock(&fs_info->extent_root->accounting_lock); + + spin_lock(&fs_info->csum_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); + spin_unlock(&fs_info->csum_root->accounting_lock); + + spin_lock(&fs_info->tree_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); + spin_unlock(&fs_info->tree_root->accounting_lock); +#endif + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); + + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); + + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); +} + +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + num_bytes = calc_global_metadata_size(fs_info); + + spin_lock(&block_rsv->lock); + spin_lock(&sinfo->lock); + + block_rsv->size = num_bytes; + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_reserved += num_bytes; + } + + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_reserved -= num_bytes; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } +#if 0 + printk(KERN_INFO"global block rsv size %llu reserved %llu\n", + block_rsv->size, block_rsv->reserved); +#endif + spin_unlock(&sinfo->lock); + spin_unlock(&block_rsv->lock); +} + +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + fs_info->chunk_block_rsv.priority = 10; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->global_block_rsv.priority = 10; + fs_info->global_block_rsv.refill_used = 1; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.priority = 10; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); + + btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); + + update_global_block_rsv(fs_info); +} + +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + 
WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); +} + +static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 3 * num_items; +} + +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries) +{ + u64 num_bytes; + int ret; + + if (num_items == 0 || root->fs_info->chunk_root == root) + return 0; + + num_bytes = calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, + num_bytes, retries); + if (!ret) { + trans->bytes_reserved += num_bytes; + trans->block_rsv = &root->fs_info->trans_block_rsv; + } + return ret; +} + +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->bytes_reserved) + return; + + BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * one for deleting orphan item, one for updating inode and + * two for calling btrfs_truncate_inode_items. + * + * btrfs_truncate_inode_items is a delete operation, it frees + * more space than it uses in most cases. So two units of + * metadata space should be enough for calling it many times. + * If all of the metadata space is used, we can commit + * transaction and use space it freed. + */ + u64 num_bytes = calc_trans_metadata_size(root, 4); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = calc_trans_metadata_size(root, 4); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; + /* + * two for root back/forward refs, two for directory entries + * and one for root of the snapshot. 
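The helpers in this hunk all size their reservations with calc_trans_metadata_size(): a worst-case charge of one leaf plus one node per level below the root, tripled, per item, with the orphan path charging four items and the snapshot path five. The global reserve, sized earlier by calc_global_metadata_size(), is built from checksum overhead for all data plus 2% of the data and metadata in use, capped at a third of the metadata in use and rounded up to a multiple of leafsize * 1024. A small worked calculation of both formulas, using made-up geometry and usage figures; this is plain arithmetic for illustration, not kernel code.

/* Worked example of the two sizing formulas added in this patch, computed
 * with made-up geometry (4K nodes/leaves, 4K blocks, 32-byte csums) and
 * made-up usage numbers. */
#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

/* (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items */
static uint64_t calc_trans_metadata_size(uint64_t leafsize, uint64_t nodesize,
					 int num_items)
{
	return (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items;
}

/* csum bytes for all data, twice, plus 2% of used space, capped at a third
 * of the metadata in use, aligned up to leafsize * 1024 */
static uint64_t calc_global_metadata_size(uint64_t data_used, uint64_t meta_used,
					  uint64_t blocksize, uint64_t csum_size,
					  uint64_t leafsize)
{
	uint64_t num_bytes = (data_used / blocksize) * csum_size * 2;
	num_bytes += (data_used + meta_used) / 50;
	if (num_bytes * 3 > meta_used)
		num_bytes = meta_used / 3;
	return align_up(num_bytes, leafsize << 10);
}

int main(void)
{
	/* an orphan reservation charges 4 items worth of metadata */
	printf("trans size, 4 items: %llu bytes\n",
	       (unsigned long long)calc_trans_metadata_size(4096, 4096, 4));

	/* 10 GiB of data and 1 GiB of metadata in use */
	printf("global rsv size: %llu bytes\n",
	       (unsigned long long)calc_global_metadata_size(
			10ULL << 30, 1ULL << 30, 4096, 32, 4096));
	return 0;
}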
+ */ + u64 num_bytes = calc_trans_metadata_size(root, 5); + dst_rsv->space_info = src_rsv->space_info; + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +{ + return num_bytes >>= 3; +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int retries = 0; + int ret; + + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + num_bytes = ALIGN(num_bytes, root->sectorsize); +again: + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + if (nr_extents > BTRFS_I(inode)->reserved_extents) { + nr_extents -= BTRFS_I(inode)->reserved_extents; + to_reserve = calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; + } + + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(block_rsv, to_reserve); + if (ret) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, + &retries); + if (ret > 0) + goto again; + return ret; + } + + BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve); + + return 0; +} + +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents < BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; + BTRFS_I(inode)->reserved_extents -= nr_extents; + } else { + nr_extents = 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free) + u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; + int factor; u64 total = num_bytes; u64 old_val; u64 byte_in_group; @@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(info, bytenr); if (!cache) return -1; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + 
BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; cache->space_info->bytes_reserved -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; - cache->space_info->bytes_used -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - if (mark_free) { - int ret; - ret = btrfs_discard_extent(root, bytenr, - num_bytes); - WARN_ON(ret); - - ret = btrfs_add_free_space(cache, bytenr, - num_bytes); - WARN_ON(ret); - } + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -/* - * this function must be called within transaction - */ -int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int reserved) +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); - set_extent_dirty(fs_info->pinned_extents, - bytenr, bytenr + num_bytes - 1, GFP_NOFS); + btrfs_put_block_group(cache); return 0; } -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) +/* + * update size of reserved extents. this function may return -EAGAIN + * if 'reserve' is true or 'sinfo' is false. 
+ */ +static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += num_bytes; - cache->space_info->bytes_reserved += num_bytes; + int ret = 0; + if (sinfo) { + struct btrfs_space_info *space_info = cache->space_info; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); } else { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + if (reserve) + cache->reserved += num_bytes; + else + cache->reserved -= num_bytes; + } + spin_unlock(&cache->lock); } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - return 0; + return ret; } int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, fs_info->pinned_extents = &fs_info->freed_extents[0]; up_write(&fs_info->extent_commit_sem); + + update_global_block_rsv(fs_info); return 0; } @@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) btrfs_add_free_space(cache, start, len); } + start += len; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; + if (cache->ro) { + cache->space_info->bytes_readonly += len; + } else if (cache->reserved_pinned > 0) { + len = min(len, cache->reserved_pinned); + cache->reserved_pinned -= len; + cache->space_info->bytes_reserved += len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - - start += len; } if (cache) @@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *next_rsv; u64 start; u64 end; + int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - return ret; -} + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_for_each_entry_safe(block_rsv, next_rsv, + &fs_info->durable_block_rsv_list, list) { -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean) -{ - int err = 0; - struct extent_buffer *buf; - - if (is_data) - goto pinit; - - /* - * discard is sloooow, and so triggering discards on - * individual btree blocks isn't a good plan. Just - * pin everything in discard mode. 
- */ - if (btrfs_test_opt(root, DISCARD)) - goto pinit; - - buf = btrfs_find_tree_block(root, bytenr, num_bytes); - if (!buf) - goto pinit; + idx = trans->transid & 0x1; + if (block_rsv->freed[idx] > 0) { + block_rsv_add_bytes(block_rsv, + block_rsv->freed[idx], 0); + block_rsv->freed[idx] = 0; + } + if (atomic_read(&block_rsv->usage) == 0) { + btrfs_block_rsv_release(root, block_rsv, (u64)-1); - /* we can reuse a block if it hasn't been written - * and it is from this transaction. We can't - * reuse anything from the tree log root because - * it has tiny sub-transactions. - */ - if (btrfs_buffer_uptodate(buf, 0) && - btrfs_try_tree_lock(buf)) { - u64 header_owner = btrfs_header_owner(buf); - u64 header_transid = btrfs_header_generation(buf); - if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_transid == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; - return 1; + if (block_rsv->freed[0] == 0 && + block_rsv->freed[1] == 0) { + list_del_init(&block_rsv->list); + kfree(block_rsv); + } + } else { + btrfs_block_rsv_release(root, block_rsv, 0); } - btrfs_tree_unlock(buf); } - free_extent_buffer(buf); -pinit: - if (path) - btrfs_set_path_blocking(path); - /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, reserved); + mutex_unlock(&fs_info->durable_block_rsv_mutex); - BUG_ON(err < 0); return 0; } @@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } } else { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - if (found_extent) { BUG_ON(is_data && refs_to_drop != extent_data_ref_count(root, path, iref)); @@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, 0, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } - if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); @@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } - ret = update_block_group(trans, root, bytenr, num_bytes, 0, - mark_free); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); BUG_ON(ret); } btrfs_free_path(path); @@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* - * when we free an extent, it is possible (and likely) that we free the last + * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. 
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct rb_node *node; - int ret; + int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, list_del_init(&head->cluster); spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); - BUG_ON(ret); + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - return 0; + return ret; out: spin_unlock(&delayed_refs->lock); return 0; } +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_group_cache *cache = NULL; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + + if (!last_ref) + return; + + block_rsv = get_block_rsv(trans, root); + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + BUG_ON(block_rsv->space_info != cache->space_info); + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto pin; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto pin; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + ret = update_reserved_bytes(cache, buf->len, 0, 0); + if (ret == -EAGAIN) { + /* block group became read-only */ + update_reserved_bytes(cache, buf->len, 0, 1); + goto out; + } + + ret = 1; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + block_rsv->reserved += buf->len; + ret = 0; + } + spin_unlock(&block_rsv->lock); + + if (ret) { + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_reserved -= buf->len; + spin_unlock(&cache->space_info->lock); + } + goto out; + } +pin: + if (block_rsv->durable && !cache->ro) { + ret = 0; + spin_lock(&cache->lock); + if (!cache->ro) { + cache->reserved_pinned += buf->len; + ret = 1; + } + spin_unlock(&cache->lock); + + if (ret) { + spin_lock(&block_rsv->lock); + block_rsv->freed[trans->transid & 0x1] += buf->len; + spin_unlock(&block_rsv->lock); + } + } +out: + btrfs_put_block_group(cache); +} + int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, @@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level) -{ - u64 used; - 
spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) - blocksize; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - - return btrfs_free_extent(trans, root, bytenr, blocksize, - parent, root_objectid, level, 0); -} - static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + int index; + if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + index = 0; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + index = 1; + else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + index = 2; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + index = 3; + else + index = 4; + return index; +} + enum btrfs_loop_type { LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, @@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - u64 exclude_start, u64 exclude_nr, int data) { int ret = 0; @@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + int index = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -4237,6 +4620,7 @@ ideal_cache: btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { + index = get_block_group_index(block_group); goto have_block_group; } } else if (block_group) { @@ -4245,7 +4629,8 @@ ideal_cache: } search: down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups, list) { + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { u64 offset; int cached; @@ -4436,23 +4821,22 @@ checks: goto loop; } - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + ret = update_reserved_bytes(block_group, num_bytes, 1, + (data & BTRFS_BLOCK_GROUP_DATA)); + if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= block_group->key.objectid && - search_start < (block_group->key.objectid + - block_group->key.offset)) - goto have_block_group; goto loop; } + /* we are all good, lets return */ ins->objectid = search_start; ins->offset = num_bytes; @@ -4460,18 +4844,18 @@ checks: btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - - update_reserved_extents(block_group, num_bytes, 1); - - /* we are all good, lets return */ break; loop: failed_cluster_refill = false; failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for * for them to make caching progress. 
Also * determine the best possible bg to cache @@ -4485,6 +4869,7 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { + index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; @@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups) { struct btrfs_block_group_cache *cache; + int index = 0; spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - - info->bytes_super), + info->bytes_readonly), (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" - "\n", + printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_used, (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_reserved, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_root, - (unsigned long long)info->bytes_super, - (unsigned long long)info->bytes_reserved); + (unsigned long long)info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) return; down_read(&info->groups_sem); - list_for_each_entry(cache, &info->block_groups, list) { +again: + list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used " "%llu pinned %llu reserved\n", @@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; up_read(&info->groups_sem); } @@ -4628,9 +5014,8 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, ins, - trans->alloc_exclude_start, - trans->alloc_exclude_nr, data); + search_start, search_end, hint_byte, + ins, data); if (ret == -ENOSPC && num_bytes > min_alloc_size) { num_bytes = num_bytes >> 1; @@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - update_reserved_extents(cache, len, 0); + update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); return ret; @@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long 
long)ins->objectid, @@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - update_reserved_extents(block_group, ins->offset, 1); + ret = update_reserved_bytes(block_group, ins->offset, 1, 1); + BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); return ret; } -/* - * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * - * returns 0 if everything worked, non-zero otherwise. - */ -static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) -{ - int ret; - u64 flags = 0; - - ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); - if (ret) - return ret; - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); - BUG_ON(ret); - } - - if (root_objectid == root->root_key.objectid) { - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) + num_bytes; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - } - return ret; -} - struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, return buf; } +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + int ret; + + block_rsv = get_block_rsv(trans, root); + + if (block_rsv->size == 0) { + ret = reserve_metadata_bytes(block_rsv, blocksize); + if (ret) + return ERR_PTR(ret); + return block_rsv; + } + + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return ERR_PTR(-ENOSPC); +} + +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); +} + /* - * helper function to allocate a block for a given tree + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * * returns the tree buffer or NULL. 
*/ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, @@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 hint, u64 empty_size) { struct btrfs_key ins; - int ret; + struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + u64 flags = 0; + int ret; + - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + empty_size, hint, (u64)-1, &ins, 0); if (ret) { - BUG_ON(ret > 0); + unuse_block_rsv(block_rsv, blocksize); return ERR_PTR(ret); } buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); + BUG_ON(IS_ERR(buf)); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } return buf; } @@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - int ret = 0; + int ret; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; @@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, - root->root_key.objectid, level, 0); - BUG_ON(ret); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; - return ret; + return 0; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. 
*/ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc = kzalloc(sizeof(*wc), GFP_NOFS); BUG_ON(!wc); - trans = btrfs_start_transaction(tree_root, 1); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); @@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) } BUG_ON(wc->level == 0); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (btrfs_should_end_transaction(trans, tree_root)) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); BUG_ON(ret); - btrfs_end_transaction(trans, tree_root); - trans = btrfs_start_transaction(tree_root, 1); - } else { - unsigned long update; - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, tree_root, - update); + btrfs_end_transaction_throttle(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; } } btrfs_release_path(root, path); @@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) kfree(root); } out: - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); return err; @@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int __alloc_chunk_for_shrink(struct btrfs_root *root, - struct btrfs_block_group_cache *shrink_block_group, - int force) +static int set_block_group_ro(struct btrfs_block_group_cache *cache) { - struct btrfs_trans_handle *trans; - u64 new_alloc_flags; - u64 calc; + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; - spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { - spin_unlock(&shrink_block_group->lock); + if (cache->ro) + return 0; - trans = btrfs_start_transaction(root, 1); - spin_lock(&shrink_block_group->lock); + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + + cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + sinfo->bytes_reserved += cache->reserved_pinned; + cache->reserved_pinned = 0; + cache->ro = 1; + ret = 0; + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; +} - new_alloc_flags = update_block_group_flags(root, - shrink_block_group->flags); - if (new_alloc_flags != shrink_block_group->flags) { - calc = - btrfs_block_group_used(&shrink_block_group->item); - } else { - calc = shrink_block_group->key.offset; - } - spin_unlock(&shrink_block_group->lock); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) - do_chunk_alloc(trans, root->fs_info->extent_root, - calc + 2 * 1024 * 1024, new_alloc_flags, force); +{ + struct btrfs_trans_handle 
*trans; + u64 alloc_flags; + int ret; - btrfs_end_transaction(trans, root); - } else - spin_unlock(&shrink_block_group->lock); - return 0; -} + BUG_ON(cache->ro); + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) + ret = set_block_group_ro(cache); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache); +out: + btrfs_end_transaction(trans, root); + return ret; +} +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); return 0; } @@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); + release_global_block_rsv(info); + while(!list_empty(&info->space_info)) { space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } list_del(&space_info->list); kfree(space_info); } return 0; } +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) while (1) { ret = find_first_block_group(root, path, &key); - if (ret > 0) { - ret = 0; - goto error; - } + if (ret > 0) + break; if (ret != 0) goto error; @@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) { ret = -ENOMEM; - break; + goto error; } atomic_set(&cache->count, 1); @@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root) BUG_ON(ret); cache->space_info = space_info; spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups); - up_write(&space_info->groups_sem); + __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_readonly(cache); + set_block_group_ro(cache); } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + 
(BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, &space_info->block_groups[3], list) + set_block_group_ro(cache); + list_for_each_entry(cache, &space_info->block_groups[4], list) + set_block_group_ro(cache); + } + + init_global_block_rsv(info); ret = 0; error: btrfs_free_path(path); @@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, BUG_ON(ret); spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&cache->space_info->groups_sem); - list_add_tail(&cache->list, &cache->space_info->block_groups); - up_write(&cache->space_info->groups_sem); + __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2d0368..a4080c2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; } -static void free_extent_state(struct extent_state *state) +void free_extent_state(struct extent_state *state) { if (!state) return; @@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree, } static int set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->set_bit_hook) { return tree->ops->set_bit_hook(tree->mapping->host, - state->start, state->end, - state->state, bits); + state, bits); } return 0; } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); @@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int bits) + int *bits) { struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (end < start) { @@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree, if (ret) return ret; - if (bits & EXTENT_DIRTY) + if (bits_to_set & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - state->state |= bits; + state->state |= bits_to_set; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * struct is freed and removed from the tree */ static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) + struct extent_state *state, + int *bits, int wake) { - int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret = state->state & bits_to_clear; - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; @@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree, state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); - if (delete || 
state->state == 0) { + if (state->state == 0) { if (state->tree) { - clear_state_cb(tree, state, state->state); rb_erase(&state->rb_node, &tree->state); state->tree = NULL; free_extent_state(state); @@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set = 0; int clear = 0; + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: @@ -580,8 +581,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, wake, - delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -602,7 +602,7 @@ hit_next: if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, wake, delete); + set |= clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; @@ -613,7 +613,7 @@ hit_next: else next_node = NULL; - set |= clear_state_bit(tree, state, bits, wake, delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -706,19 +706,19 @@ out: static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int bits) + int *bits) { int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; ret = set_state_cb(tree, state, bits); if (ret) return ret; - - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - state->state |= bits; + state->state |= bits_to_set; return 0; } @@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state, * [start, end] is inclusive This takes the tree lock. 
*/ -static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -778,7 +778,7 @@ again: */ node = tree_search(tree, start); if (!node) { - err = insert_state(tree, prealloc, start, end, bits); + err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); goto out; @@ -802,7 +802,7 @@ hit_next: goto out; } - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; @@ -852,7 +852,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; cache_state(state, cached_state); @@ -877,7 +877,7 @@ hit_next: else this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, - bits); + &bits); BUG_ON(err == -EEXIST); if (err) { prealloc = NULL; @@ -903,7 +903,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, &bits); if (err) { prealloc = NULL; goto out; @@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, - NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, @@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - if (op & EXTENT_CLEAR_ACCOUNTING) - clear_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | @@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, if (tree->ops && tree->ops->submit_bio_hook) tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num, bio_flags); + mirror_num, bio_flags, start); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) @@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; + struct btrfs_ordered_extent *ordered; int ret; int nr = 0; size_t page_offset = 0; @@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); end = page_end; - lock_extent(tree, start, end, GFP_NOFS); + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bbab481..5691c7b 
100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -16,7 +16,9 @@ #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 @@ -47,7 +49,7 @@ struct extent_state; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags); + unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -69,10 +71,10 @@ struct extent_io_ops { struct extent_state *state); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long bits); + int *bits); int (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits); +void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54a2550..a562a25 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) { u32 sum; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; - u64 offset; + u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, WARN_ON(bio->bi_vcnt <= 0); disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; while (bio_index < bio->bi_vcnt) { - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); if (ret == 0) goto found; @@ -238,6 +242,7 @@ found: else set_state_private(io_tree, offset, sum); disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; bio_index++; bvec++; } @@ -245,6 +250,18 @@ found: return 0; } +int 
btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list) { @@ -657,6 +674,9 @@ again: goto found; } ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + if (ret == -EFBIG) { u32 item_size; /* we found one, but it isn't big enough yet */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29ff749..787b50a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -46,32 +46,42 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, - const char __user *buf) + struct iov_iter *i) { - long page_fault = 0; - int i; + size_t copied; + int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); + struct page *page = prepared_pages[pg]; +again: + if (unlikely(iov_iter_fault_in_readable(i, count))) + return -EFAULT; /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); + copied = iov_iter_copy_from_user(page, i, offset, count); + /* Flush processor's dcache for this page */ flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; + iov_iter_advance(i, copied); + write_bytes -= copied; - if (page_fault) - break; + if (unlikely(copied == 0)) { + count = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } } - return page_fault ? -EFAULT : 0; + return 0; } /* @@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, NULL); - if (err) - return err; + BUG_ON(err); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. 
*/ } - return err; + return 0; } /* @@ -823,45 +832,46 @@ again: return 0; } -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - loff_t pos; + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *pinned[2]; + struct page **pages = NULL; + struct iov_iter i; + loff_t *ppos = &iocb->ki_pos; loff_t start_pos; ssize_t num_written = 0; ssize_t err = 0; + size_t count; + size_t ocount; int ret = 0; - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; int nrptrs; - struct page *pinned[2]; unsigned long first_index; unsigned long last_index; int will_write; + int buffered = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); pinned[0] = NULL; pinned[1] = NULL; - pos = *ppos; start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* do the reserve before the mutex lock in case we have to do some - * flushing. We wouldn't deadlock, but this is more polite. - */ - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (err) - goto out_nolock; - mutex_lock(&inode->i_mutex); + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + goto out; + count = ocount; + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, goto out; file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, ppos, count, + ocount); + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + if (num_written < 0) { + ret = num_written; + num_written = 0; + goto out; + } else if (num_written == count) { + /* pick up pos changes done by the generic code */ + pos = *ppos; + goto out; + } + /* + * We are going to do buffered for the rest of the range, so we + * need to make sure to invalidate the buffered pages when we're + * done. 
+ */ + buffered = 1; + pos += num_written; + } + + iov_iter_init(&i, iov, nr_segs, count, num_written); + nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; - BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; /* * there are lots of better ways to do this, but this code @@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unlock_page(pinned[0]); } } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { pinned[1] = grab_cache_page(inode->i_mapping, last_index); if (!PageUptodate(pinned[1])) { ret = btrfs_readpage(NULL, pinned[1]); @@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } } - while (count > 0) { + while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, nrptrs * - (size_t)PAGE_CACHE_SIZE - + size_t write_bytes = min(iov_iter_count(&i), + nrptrs * (size_t)PAGE_CACHE_SIZE - offset); size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_data_free_space(root, inode, write_bytes); + ret = btrfs_delalloc_reserve_space(inode, write_bytes); if (ret) goto out; @@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); - btrfs_drop_pages(pages, num_pages); - goto out; + write_bytes, pages, &i); + if (ret == 0) { + dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); } - ret = dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } @@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, btrfs_throttle(root); } - buf += write_bytes; - count -= write_bytes; pos += write_bytes; num_written += write_bytes; @@ -976,9 +1016,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); -out_nolock: kfree(pages); if (pinned[0]) page_cache_release(pinned[0]); @@ -1008,7 +1046,7 @@ out_nolock: num_written = err; if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { @@ -1023,7 +1061,7 @@ out_nolock: btrfs_end_transaction(trans, root); } } - if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT && buffered) { invalidate_mapping_pages(inode->i_mapping, start_pos >> PAGE_CACHE_SHIFT, (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); @@ 
-1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ -int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int btrfs_sync_file(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1104,9 +1143,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (file && file->private_data) btrfs_ioctl_trans_end(file); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto out; } @@ -1161,7 +1200,7 @@ const struct file_operations btrfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .splice_read = generic_file_splice_read, - .write = btrfs_file_write, + .aio_write = btrfs_file_aio_write, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 72ce3c1..64f1150 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, return 0; } +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod) +{ + struct btrfs_key key; + struct btrfs_inode_ref *ref; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!find_name_in_backref(path, name, name_len, &ref)) + return NULL; + return ref; +} + int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d601629..fa6ccc1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -414,6 +415,7 @@ again: trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { @@ -439,7 +441,6 @@ again: start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); btrfs_end_transaction(trans, root); @@ -697,6 +698,38 @@ retry: return 0; } +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use 
that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode, trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; actual_end = min_t(u64, isize, end + 1); @@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); - - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. - */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@ -1174,6 +1185,13 @@ out_check: num_bytes, num_bytes, type); BUG_ON(ret); + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + BUG_ON(ret); + } + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | @@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, } static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) + struct extent_state *orig, u64 split) { + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_inc(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_dec(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. 
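The extent-state hooks reworked here drop the spinlock-protected per-inode counter in favour of an atomic outstanding_extents count, and introduce EXTENT_FIRST_DELALLOC so the first range of a reservation is skipped (presumably because the reservation path has already accounted for it). A minimal userspace model of that bookkeeping; toy_inode and the flag values are illustrative stand-ins, not the kernel definitions:

#include <stdatomic.h>
#include <stdio.h>

#define EXTENT_DELALLOC        (1 << 0)
#define EXTENT_FIRST_DELALLOC  (1 << 1)   /* only set on the first range of a reservation */

struct toy_inode {
	atomic_int outstanding_extents;   /* delalloc ranges not yet written back */
	long long delalloc_bytes;
};

/* a delalloc range is split in two: one more outstanding extent */
static void split_hook(struct toy_inode *ino, int state)
{
	if (state & EXTENT_DELALLOC)
		atomic_fetch_add(&ino->outstanding_extents, 1);
}

/* two adjacent delalloc ranges merge: one fewer outstanding extent */
static void merge_hook(struct toy_inode *ino, int other_state)
{
	if (other_state & EXTENT_DELALLOC)
		atomic_fetch_sub(&ino->outstanding_extents, 1);
}

/* setting DELALLOC on a previously clean range */
static void set_hook(struct toy_inode *ino, int old_state, int *bits, long long len)
{
	if (!(old_state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		if (*bits & EXTENT_FIRST_DELALLOC)
			*bits &= ~EXTENT_FIRST_DELALLOC;  /* first range: already counted */
		else
			atomic_fetch_add(&ino->outstanding_extents, 1);
		ino->delalloc_bytes += len;
	}
}

int main(void)
{
	struct toy_inode ino = { .outstanding_extents = 0, .delalloc_bytes = 0 };
	int bits = EXTENT_DELALLOC | EXTENT_FIRST_DELALLOC;

	set_hook(&ino, 0, &bits, 4096);        /* counted by the reservation path */
	split_hook(&ino, EXTENT_DELALLOC);     /* +1 */
	merge_hook(&ino, EXTENT_DELALLOC);     /* -1 */
	printf("outstanding=%d delalloc=%lld\n",
	       atomic_load(&ino.outstanding_extents), ino.delalloc_bytes);
	return 0;
}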
*/ -static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_delalloc_reserve_space(root, inode, end - start + 1); + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += end - start + 1; - root->fs_info->delalloc_bytes += end - start + 1; + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * extent_io.c clear_bit_hook, see set_bit_hook for why */ static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long bits) + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - if (bits & EXTENT_DO_ACCOUNTING) { - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - } + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - if (state->end - state->start + 1 > - root->fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs warning: delalloc account " - "%llu %llu\n", - (unsigned long long) - state->end - state->start + 1, - (unsigned long long) - root->fs_info->delalloc_bytes); - btrfs_delalloc_free_space(root, inode, (u64)-1); - root->fs_info->delalloc_bytes = 0; - BTRFS_I(inode)->delalloc_bytes = 0; - } else { - btrfs_delalloc_free_space(root, inode, - state->end - - state->start + 1); - root->fs_info->delalloc_bytes -= state->end - - state->start + 1; - BTRFS_I(inode)->delalloc_bytes -= state->end - - state->start + 1; - } + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 
list_del_init(&BTRFS_I(inode)->delalloc_inodes); @@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, */ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * are inserted into the btree */ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; return btrfs_map_bio(root, rw, bio, mirror_num, 1); @@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * on write, or reading the csums from the tree before a read */ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, /* we're doing a write, do the async checksumming */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - bio_flags, __btrfs_submit_bio_start, + bio_flags, bio_offset, + __btrfs_submit_bio_start, __btrfs_submit_bio_done); } @@ -1520,6 +1525,7 @@ again: goto again; } + BUG(); btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); out: @@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; @@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); } goto out; } @@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; @@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - /* this also removes the ordered extent from the tree */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -1838,7 +1848,7 @@ 
static int btrfs_io_failed_hook(struct bio *failed_bio, BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, - failrec->bio_flags); + failrec->bio_flags, 0); return 0; } @@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } /* + * calculate extra metadata reservation when snapshotting a subvolume + * contains orphan files. + */ +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + + root = pending->root; + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + block_rsv = root->orphan_block_rsv; + + /* orphan block reservation for the snapshot */ + num_bytes = block_rsv->size; + + /* + * after the snapshot is created, COWing tree blocks may use more + * space than it frees. So we should make sure there is enough + * reserved space. + */ + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes += block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + } + + *bytes_to_reserve += num_bytes; +} + +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *snap = pending->snap; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + int ret; + + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + /* refill source subvolume's orphan block reservation */ + block_rsv = root->orphan_block_rsv; + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes = block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + root->orphan_block_rsv, + num_bytes); + BUG_ON(ret); + } + + /* setup orphan block reservation for the snapshot */ + block_rsv = btrfs_alloc_block_rsv(snap); + BUG_ON(!block_rsv); + + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + snap->orphan_block_rsv = block_rsv; + + num_bytes = root->orphan_block_rsv->size; + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + block_rsv, num_bytes); + BUG_ON(ret); + +#if 0 + /* insert orphan item for the snapshot */ + WARN_ON(!root->orphan_item_inserted); + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + snap->root_key.objectid); + BUG_ON(ret); + snap->orphan_item_inserted = 1; +#endif +} + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* + * This is called in transaction commmit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. 
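btrfs_orphan_pre_snapshot() and btrfs_orphan_post_snapshot() size the orphan reservation top-up with the same arithmetic: pick the freed[] slot by transaction parity and make up whatever reserved + freed[index] falls short of size. The calculation isolated as a plain C helper; toy_block_rsv is a stand-in for the real struct btrfs_block_rsv:

#include <stdio.h>
#include <stdint.h>

/* stand-in for the reservation bookkeeping used in the patch */
struct toy_block_rsv {
	uint64_t size;       /* how much the reservation should hold  */
	uint64_t reserved;   /* currently reserved bytes              */
	uint64_t freed[2];   /* bytes freed back, per transaction parity */
};

/* extra bytes needed so that reserved + freed[index] reaches size again */
static uint64_t orphan_rsv_topup(const struct toy_block_rsv *rsv, uint64_t transid)
{
	int index = transid & 0x1;
	uint64_t have = rsv->reserved + rsv->freed[index];

	return have < rsv->size ? rsv->size - have : 0;
}

int main(void)
{
	struct toy_block_rsv rsv = { .size = 1 << 20, .reserved = 600 << 10,
				     .freed = { 100 << 10, 0 } };

	/* transid 7 -> index 1: only 600K of the 1M target is covered */
	printf("top-up: %llu bytes\n",
	       (unsigned long long)orphan_rsv_topup(&rsv, 7));
	return 0;
}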
+ */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + if (!list_empty(&root->orphan_list) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + if (root->orphan_item_inserted && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + root->orphan_item_inserted = 0; + } + + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + } +} + +/* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; - spin_lock(&root->list_lock); + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root); + BUG_ON(!block_rsv); + } - /* already on the orphan list, we're good */ - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. 
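btrfs_orphan_add() reduces to a few decisions taken under orphan_lock: put the inode on the in-memory orphan list the first time around, reserve metadata once per inode, and only then insert the on-disk item. A flag-only sketch of that decision flow (no locking or disk I/O, and the toy_* names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct toy_orphan_state {
	bool on_list;          /* inode already on root->orphan_list */
	bool meta_reserved;    /* orphan_meta_reserved already set   */
};

struct toy_orphan_actions {
	bool reserve_meta;     /* take a metadata reservation        */
	int  insert;           /* 0: nothing, 1: insert orphan item  */
};

/* mirrors the decisions btrfs_orphan_add() takes under orphan_lock */
static struct toy_orphan_actions orphan_add(struct toy_orphan_state *st)
{
	struct toy_orphan_actions act = { false, 0 };

	if (!st->on_list) {
		st->on_list = true;
		act.insert = 1;            /* track the file on disk */
	}
	if (!st->meta_reserved) {
		st->meta_reserved = true;
		act.reserve_meta = true;   /* one reservation per orphan inode */
	}
	return act;
}

int main(void)
{
	struct toy_orphan_state st = { false, false };
	struct toy_orphan_actions a = orphan_add(&st);
	struct toy_orphan_actions b = orphan_add(&st);   /* second call is a no-op */

	printf("first:  reserve=%d insert=%d\n", a.reserve_meta, a.insert);
	printf("second: reserve=%d insert=%d\n", b.reserve_meta, b.insert);
	return 0;
}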
+ */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + } else { + WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved); } - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 1; + reserve = 1; + } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (block_rsv) + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* - * insert an orphan item to track this unlinked/truncated file - */ - ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); + } - return ret; + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + return 0; } /* @@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; int ret = 0; - spin_lock(&root->list_lock); - - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + list_del_init(&BTRFS_I(inode)->i_orphan); + delete_item = 1; } - list_del_init(&BTRFS_I(inode)->i_orphan); - if (!trans) { - spin_unlock(&root->list_lock); - return 0; + if (BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 0; + release_rsv = 1; } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (trans && delete_item) { + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } - ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + if (release_rsv) + btrfs_orphan_release_metadata(inode); - return ret; + return 0; } /* @@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - if (!xchg(&root->clean_orphans, 0)) + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return; path = btrfs_alloc_path(); @@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) - break; + BUG_ON(IS_ERR(inode)); /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); /* * if this is a bad inode, means we actually succeeded in @@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * do a destroy_inode */ if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* this will do delete_inode and 
everything for us */ iput(inode); } + btrfs_free_path(path); + + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || root->orphan_item_inserted) { + trans = btrfs_join_transaction(root, 1); + btrfs_end_transaction(trans, root); + } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); - - btrfs_free_path(path); } /* @@ -2478,29 +2666,201 @@ out: return ret; } -static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +/* helper to check if there is any shared block in the path */ +static int check_path_shared(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *eb; + int level; + int ret; + u64 refs; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + eb = path->nodes[level]; + if (!btrfs_block_can_be_shared(root, eb)) + continue; + ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + &refs, NULL); + if (refs > 1) + return 1; + } + return 0; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space. + * so in enospc case, we should make sure they will free space before + * allowing them to use the global metadata reservation. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, + struct dentry *dentry) { - struct btrfs_root *root; struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; struct inode *inode = dentry->d_inode; + u64 index; + int check_link = 1; + int err = -ENOSPC; int ret; - unsigned long nr = 0; - root = BTRFS_I(dir)->root; + trans = btrfs_start_transaction(root, 10); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; - /* - * 5 items for unlink inode - * 1 for orphan - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; + if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return ERR_PTR(-ENOSPC); + + /* check if there is someone else holds reference */ + if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) + return ERR_PTR(-ENOSPC); + + if (atomic_read(&inode->i_count) > 2) + return ERR_PTR(-ENOSPC); + + if (xchg(&root->fs_info->enospc_unlink, 1)) + return ERR_PTR(-ENOSPC); - trans = btrfs_start_transaction(root, 1); + path = btrfs_alloc_path(); + if (!path) { + root->fs_info->enospc_unlink = 0; + return ERR_PTR(-ENOMEM); + } + + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 6); - return PTR_ERR(trans); + btrfs_free_path(path); + root->fs_info->enospc_unlink = 0; + return trans; + } + + path->skip_locking = 1; + path->search_commit_root = 1; + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(dir)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + if (ret == 0 && S_ISREG(inode->i_mode)) { + ret = 
btrfs_lookup_file_extent(trans, root, path, + inode->i_ino, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (check_path_shared(root, path)) + goto out; + btrfs_release_path(root, path); + } + + if (!check_link) { + err = 0; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + if (di) { + if (check_path_shared(root, path)) + goto out; + } else { + err = 0; + goto out; } + btrfs_release_path(root, path); + + ref = btrfs_lookup_inode_ref(trans, root, path, + dentry->d_name.name, dentry->d_name.len, + inode->i_ino, dir->i_ino, 0); + if (IS_ERR(ref)) { + err = PTR_ERR(ref); + goto out; + } + BUG_ON(!ref); + if (check_path_shared(root, path)) + goto out; + index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + BUG_ON(ret == -ENOENT); + if (check_path_shared(root, path)) + goto out; + + err = 0; +out: + btrfs_free_path(path); + if (err) { + btrfs_end_transaction(trans, root); + root->fs_info->enospc_unlink = 0; + return ERR_PTR(err); + } + + trans->block_rsv = &root->fs_info->global_block_rsv; + return trans; +} + +static void __unlink_end_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (trans->block_rsv == &root->fs_info->global_block_rsv) { + BUG_ON(!root->fs_info->enospc_unlink); + root->fs_info->enospc_unlink = 0; + } + btrfs_end_transaction_throttle(trans, root); +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); @@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); - if (inode->i_nlink == 0) + if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + } nr = trans->blocks_used; - - btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 6); + __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; int err = 0; - int ret; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; unsigned long nr = 0; @@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; - ret = btrfs_reserve_metadata_space(root, 5); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 5); + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) return PTR_ERR(trans); - } btrfs_set_trans_block_group(trans, dir); @@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) btrfs_i_size_write(inode, 0); out: nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 5); + 
__unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); - if (ret && !err) - err = ret; return err; } @@ -3029,6 +3380,7 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + BUG_ON(ret); } btrfs_free_path(path); return err; @@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) again: page = grab_cache_page(mapping, index); if (!page) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; } @@ -3132,8 +3479,7 @@ again: out_unlock: if (ret) - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; u64 hole_start = (inode->i_size + mask) & ~mask; @@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_reserve_metadata_space(root, 2); - if (err) + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); break; - - trans = btrfs_start_transaction(root, 1); + } btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, @@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) last_byte - 1, 0); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); + em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } + free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); return err; @@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) } } - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_orphan_add(trans, inode); @@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 1); btrfs_btree_balance_dirty(root, nr); if (attr->ia_size > inode->i_size) { @@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) i_size_write(inode, attr->ia_size); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = 
root->orphan_block_rsv; + BUG_ON(!trans->block_rsv); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode) btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + trans->block_rsv = root->orphan_block_rsv; + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); + } if (ret == 0) { @@ -3596,40 +3956,10 @@ again: return 0; } -static noinline void init_btrfs_i(struct inode *inode) -{ - struct btrfs_inode *bi = BTRFS_I(inode); - - bi->generation = 0; - bi->sequence = 0; - bi->last_trans = 0; - bi->last_sub_trans = 0; - bi->logged_trans = 0; - bi->delalloc_bytes = 0; - bi->reserved_bytes = 0; - bi->disk_i_size = 0; - bi->flags = 0; - bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; - bi->ordered_data_close = 0; - bi->force_compress = 0; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->log_mutex); -} - static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; inode->i_ino = args->ino; - init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; btrfs_set_inode_space_info(args->root, inode); return 0; @@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - init_btrfs_i(inode); - BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); BTRFS_I(inode)->dummy_inode = 1; @@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct btrfs_trans_handle *trans; int ret = 0; - if (root->fs_info->btree_inode == inode) + if (BTRFS_I(inode)->dummy_inode) return 0; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + int ret; + + if (BTRFS_I(inode)->dummy_inode) + return; trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - btrfs_update_inode(trans, root, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + if (printk_ratelimit()) { + printk(KERN_ERR "btrfs: fail to " + "dirty inode %lu error %ld\n", + inode->i_ino, PTR_ERR(trans)); + } + return; + } + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + if (printk_ratelimit()) { + printk(KERN_ERR 
"btrfs: fail to " + "dirty inode %lu error %d\n", + inode->i_ino, ret); + } + } + } btrfs_end_transaction(trans, root); } @@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * btrfs_get_inode_index_count has an explanation for the magic * number */ - init_btrfs_i(inode); BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; @@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); + btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); return err; } @@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int err; int drop_inode = 0; + int err; unsigned long nr = 0; u64 objectid; u64 index = 0; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, @@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; - /* - * 1 item for inode ref - * 2 items for dir items - */ - err = btrfs_reserve_metadata_space(root, 3); - if (err) - return err; - btrfs_inc_nlink(inode); err = btrfs_set_inode_index(dir, &index); if (err) goto fail; - trans = btrfs_start_transaction(root, 1); + /* + * 1 item for inode ref + * 2 items for dir items + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } btrfs_set_trans_block_group(trans, dir); 
atomic_inc(&inode->i_count); @@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: - btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 items for inode and ref * 2 items for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_fail; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_fail: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - -out_unlock: - btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4770,6 +5098,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@ -4866,11 +5195,651 @@ out: return em; } +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + + trans = btrfs_join_transaction(root, 0); + if (!trans) + return ERR_PTR(-ENOMEM); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 len) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = 
BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + backref_offset = btrfs_file_extent_offset(leaf, fi); + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end < offset + len) { + /* extent doesn't include our full range, must cow */ + goto out; + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + key.offset - backref_offset, disk_bytenr)) + goto out; + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + num_bytes = min(offset + len, extent_end) - offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + struct btrfs_trans_handle *trans; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. 
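can_nocow_odirect() is a chain of early-out checks; only when every one passes may an O_DIRECT write overwrite the existing extent instead of going through COW. The control flow reduced to a userspace checklist with the individual checks stubbed out as struct fields (illustrative names, not kernel functions):

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

enum extent_type { EXTENT_REG, EXTENT_PREALLOC, EXTENT_INLINE };

struct toy_extent {
	enum extent_type type;
	uint64_t file_offset;   /* where the extent starts in the file     */
	uint64_t num_bytes;     /* how much of the file it covers          */
	bool     readonly;      /* backed by a read-only block group?      */
	bool     shared;        /* referenced by another file or snapshot? */
	bool     has_csums;     /* checksums recorded for this range?      */
};

/* 1 = safe to overwrite in place, 0 = must fall back to COW */
static int can_nocow(const struct toy_extent *ext, uint64_t offset, uint64_t len)
{
	if (ext->type != EXTENT_REG && ext->type != EXTENT_PREALLOC)
		return 0;                               /* inline/compressed: cow */
	if (ext->file_offset > offset)
		return 0;                               /* wrong extent */
	if (ext->file_offset + ext->num_bytes < offset + len)
		return 0;                               /* does not cover the range */
	if (ext->readonly || ext->shared || ext->has_csums)
		return 0;                               /* would break refs/csums */
	return 1;
}

int main(void)
{
	struct toy_extent ext = { EXTENT_PREALLOC, 0, 1 << 20, false, false, false };

	printf("nocow ok: %d\n", can_nocow(&ext, 4096, 8192));
	ext.shared = true;
	printf("nocow ok: %d\n", can_nocow(&ext, 4096, 8192));
	return 0;
}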
+ */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = em->len - (start - em->start); + goto map; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + /* + * we're not going to log anything, but we do need + * to make sure the current transaction stays open + * while we look for nocow cross refs + */ + trans = btrfs_join_transaction(root, 0); + if (!trans) + goto must_cow; + + if (can_nocow_odirect(trans, inode, start, len) == 1) { + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + btrfs_end_transaction(trans, root); + if (ret) { + free_extent_map(em); + return ret; + } + goto unlock; + } + btrfs_end_transaction(trans, root); + } +must_cow: + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->len - (start - em->start)); +unlock: + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR 
"btrfs csum failed ino %lu off" + " %llu csum %u private %u\n", + inode->i_ino, (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + int ret; + + if (err) + goto out_done; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered, + dip->logical_offset, dip->bytes); + if (!ret) + goto out_done; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + btrfs_ordered_update_i_size(inode, 0, ordered); + btrfs_update_inode(trans, root, inode); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + u64 start; + int skip_sum; + int write = rw & (1 << BIO_RW); + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + if (!skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + ret = -ENOMEM; + 
goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + start = dip->logical_offset; + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = (u64)bio->bi_sector << 9; + bio->bi_private = dip; + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto out_err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, 0, 0, + dip->logical_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + if (ret) + goto out_err; + return; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + dip->logical_offset, dip->csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); + if (ret) + goto out_err; + return; +out_err: + kfree(dip->csums); + kfree(dip); +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, + dip->logical_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + int seg; + size_t size; + unsigned long addr; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + + if (offset & blocksize_mask) + goto out; + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } + retval = 0; +out: + return retval; +} static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; + size_t count = iov_length(iov, nr_segs); + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, + offset, nr_segs)) { + return 0; + } + + lockstart = offset; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. 
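check_direct_IO() rejects a request unless the file offset and every iovec segment, both base address and length, are aligned to the sector size. The same test in plain C, with the sector size passed in instead of read from root->sectorsize:

#include <stdio.h>
#include <sys/uio.h>

/* 0 if everything is sector-aligned, -1 otherwise */
static int dio_aligned(unsigned int sectorsize, long long offset,
		       const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long mask = sectorsize - 1;   /* sectorsize must be a power of two */
	unsigned long seg;

	if (offset & mask)
		return -1;

	for (seg = 0; seg < nr_segs; seg++) {
		unsigned long addr = (unsigned long)iov[seg].iov_base;

		/* neither the buffer address nor its length may straddle a sector */
		if ((addr & mask) || (iov[seg].iov_len & mask))
			return -1;
	}
	return 0;
}

int main(void)
{
	static char buf[8192] __attribute__((aligned(4096)));
	struct iovec iov[2] = {
		{ .iov_base = buf,        .iov_len = 4096 },
		{ .iov_base = buf + 4096, .iov_len = 4096 },
	};

	printf("aligned request:  %d\n", dio_aligned(4096, 0, iov, 2));
	printf("unaligned offset: %d\n", dio_aligned(4096, 512, iov, 2));
	return 0;
}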
+ */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } +out: + free_extent_state(cached_state); + return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - ret = VM_FAULT_SIGBUS; - goto out; - } - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -5059,7 +6021,6 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -5100,7 +6061,6 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; @@ -5127,10 +6087,10 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: return ret; } @@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; /* * setattr is responsible for setting 
the ordered_data_close flag, @@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + if (!trans) { + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; + } + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + trans = NULL; + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode) nr = trans->blocks_used; btrfs_end_transaction(trans, root); + trans = NULL; btrfs_btree_balance_dirty(root, nr); - - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); } if (ret == 0 && inode->i_nlink > 0) { @@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping, struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; + struct inode *inode; ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; + + ei->root = NULL; + ei->space_info = NULL; + ei->generation = 0; + ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; - ei->outstanding_extents = 0; - ei->reserved_extents = 0; - ei->root = NULL; + ei->delalloc_bytes = 0; + ei->reserved_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->index_cnt = (u64)-1; + ei->last_unlink_trans = 0; + spin_lock_init(&ei->accounting_lock); + atomic_set(&ei->outstanding_extents, 0); + ei->reserved_extents = 0; + + ei->ordered_data_close = 0; + ei->orphan_meta_reserved = 0; + ei->dummy_inode = 0; + ei->force_compress = 0; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree, GFP_NOFS); + extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); + mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); - return &ei->vfs_inode; + RB_CLEAR_NODE(&ei->rb_node); + + return inode; } void btrfs_destroy_inode(struct inode *inode) @@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", inode->i_ino); list_del_init(&BTRFS_I(inode)->i_orphan); } - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - - /* - * We want to reserve the absolute worst case amount of items. 
So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - */ - ret = btrfs_reserve_metadata_space(root, 11); - if (ret) - return ret; - /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so @@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* close the racy window with snapshot create/destroy ioctl */ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 20); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, new_dir); if (dest != root) @@ -5550,7 +6552,6 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 11); return ret; } @@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +{ + struct btrfs_inode *binode; + struct inode *inode = NULL; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(&root->fs_info->delalloc_inodes)) { + binode = list_entry(root->fs_info->delalloc_inodes.next, + struct btrfs_inode, delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (inode) { + list_move_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); + break; + } + + list_del_init(&binode->delalloc_inodes); + cond_resched_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + if (inode) { + write_inode_now(inode, 0); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + return 1; + } + return 0; +} + static int btrfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 items for inode item and ref * 2 items for dir items * 1 item for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto out_fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry 
*dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -out_fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5726,33 +6751,28 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode, loff_t actual_len) +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; - u64 num_bytes = end - start; int ret = 0; - u64 i_size; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 1); - - ret = btrfs_reserve_extent(trans, root, num_bytes, - root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - if (ret) { - WARN_ON(1); - goto stop_trans; + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; } - ret = btrfs_reserve_metadata_space(root, 3); + ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, + 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); - goto stop_trans; + btrfs_end_transaction(trans, root); + break; } ret = insert_reserved_file_extent(trans, inode, @@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, num_bytes -= ins.offset; cur_offset += ins.offset; - alloc_hint = ins.objectid + ins.offset; + *alloc_hint = ins.objectid + ins.offset; inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (actual_len > inode->i_size) && - (cur_offset > inode->i_size)) { - + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size = actual_len; + i_size_write(inode, actual_len); else - i_size = cur_offset; - i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + i_size_write(inode, cur_offset); + i_size_write(inode, cur_offset); + btrfs_ordered_update_i_size(inode, cur_offset, NULL); } ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 3); } return ret; - -stop_trans: - btrfs_end_transaction(trans, root); - return ret; - } static long btrfs_fallocate(struct inode *inode, int mode, @@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) goto out; @@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = prealloc_file_range(inode, - cur_offset, last_byte, - alloc_hint, mode, offset+len); + ret = btrfs_prealloc_file_range(inode, 0, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); if (ret < 0) { free_extent_map(em); break; } } - if (em->block_start <= EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; free_extent_map(em); cur_offset = last_byte; @@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - 
btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 97a9783..4cdb98c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root, u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, + 0, &objectid); + if (ret) + return ret; /* * 1 - inode item * 2 - refs * 1 - root item * 2 - dir items */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); - if (ret) - goto fail; + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -345,13 +341,10 @@ fail: err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; - - btrfs_unreserve_metadata_space(root, 6); return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen) +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - goto fail; - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); - if (!pending_snapshot->name) { - ret = -ENOMEM; - kfree(pending_snapshot); - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - memcpy(pending_snapshot->name, name, namelen); - pending_snapshot->name[namelen] = '\0'; + if (!pending_snapshot) + return -ENOMEM; + + btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); pending_snapshot->root = root; + + trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); + BUG_ON(ret); + list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); BUG_ON(ret); - btrfs_unreserve_metadata_space(root, 6); + + ret = pending_snapshot->error; + if (ret) + goto fail; + + btrfs_orphan_cleanup(pending_snapshot->snap); inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { @@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, d_instantiate(dentry, inode); ret = 0; fail: + kfree(pending_snapshot); return ret; } @@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen); + error = create_snapshot(snap_src, dentry); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, 
name, namelen); @@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file, if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = 1; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) { - ret = -ENOSPC; - break; - } - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); - ret = -ENOSPC; - break; - } + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto err_unlock; again: if (inode->i_size == 0 || i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { @@ -622,8 +598,10 @@ again: } page = grab_cache_page(inode->i_mapping, i); - if (!page) + if (!page) { + ret = -ENOMEM; goto err_reservations; + } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -631,6 +609,7 @@ again: if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + ret = -EIO; goto err_reservations; } } @@ -644,8 +623,7 @@ again: wait_on_page_writeback(page); if (PageDirty(page)) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto loop_unlock; } @@ -683,7 +661,6 @@ loop_unlock: page_cache_release(page); mutex_unlock(&inode->i_mutex); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); i++; } @@ -713,9 +690,9 @@ loop_unlock: return 0; err_reservations: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +err_unlock: mutex_unlock(&inode->i_mutex); - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); return ret; } @@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, device->name, (unsigned long long)new_size); if (new_size > old_size) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, dentry->d_name.name, @@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - ret = btrfs_insert_orphan_item(trans, - root->fs_info->tree_root, - dest->root_key.objectid); - BUG_ON(ret); + if (!xchg(&dest->orphan_item_inserted, 1)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + } ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EPERM; goto out; } - btrfs_defrag_root(root, 0); - btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root, 0); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root, 0); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range->len = (u64)-1; } - btrfs_defrag_file(file, range); + ret = btrfs_defrag_file(file, range); 
kfree(range); break; + default: + ret = -EINVAL; } out: mnt_drop_write(file->f_path.mnt); @@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_wait_ordered_range(src, off, off+len); } - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - /* punch hole in destination first */ - btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); - /* clone data */ key.objectid = src->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; @@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * note the key will change type as we walk through the * tree. */ - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.objectid = inode->i_ino; new_key.offset = key.offset + destoff - off; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) - goto out; + BUG_ON(ret); leaf = path->nodes[0]; slot = path->slots[0]; @@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - if (key.offset + datal > off + len) - datal = off + len - key.offset; - /* disko == 0 means it's a hole */ if (!disko) datao = 0; @@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (comp && (skip || trim)) { ret = -EINVAL; + btrfs_end_transaction(trans, root); goto out; } size -= skip + trim; datal -= skip + trim; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) - goto out; + BUG_ON(ret); if (skip) { u32 start = @@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } btrfs_mark_buffer_dirty(leaf); - } + btrfs_release_path(root, path); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (new_key.offset + datal > inode->i_size) + btrfs_i_size_write(inode, + new_key.offset + datal); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } next: btrfs_release_path(root, path); key.offset++; @@ -1717,17 +1727,7 @@ next: ret = 0; out: btrfs_release_path(root, path); - if (ret == 0) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (destoff + olen > inode->i_size) - btrfs_i_size_write(inode, destoff + olen); - BTRFS_I(inode)->flags = BTRFS_I(src)->flags; - ret = btrfs_update_inode(trans, root, inode); - } - btrfs_end_transaction(trans, root); unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - if (ret) - vmtruncate(inode, 0); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); diff --git 
a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0e..e56c72b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 return 1;
 }
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
+{
+ if (file_offset + len <= entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
 /*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int dio)
 {
 struct btrfs_ordered_inode_tree *tree;
 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 set_bit(type, &entry->flags);
+ if (dio)
+ set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
 /* one ref for the tree */
 atomic_set(&entry->refs, 1);
 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 return 0;
 }
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 1);
+}
+
 /*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 tree->last = NULL;
 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
- inode, 1);
-
 spin_lock(&root->fs_info->ordered_extent_lock);
 list_del_init(&entry->root_extent_list);
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
 * start IO on any dirty ones so the wait doesn't stall waiting
 * for pdflush to find them
 */
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
 if (wait) {
 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
@@ -588,6 +609,47 @@ out:
 return entry;
 }
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ node = tree_search(tree, file_offset + len);
+ if (!node)
+ goto out;
+ }
+
+ while (1) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ break;
+
+ if (entry->file_offset >= file_offset + len) {
+ entry = NULL;
+ break;
+ }
+ entry = NULL;
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ if (entry)
+ atomic_inc(&entry->refs);
+ spin_unlock(&tree->lock);
+ return entry;
+}
+
 /*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a..8ac3654 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
 #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
 struct btrfs_ordered_extent {
 /* logical offset in the file */
 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 struct btrfs_ordered_extent **cached,
 u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int tyep);
+ u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_sum(struct inode *inode,
 struct btrfs_ordered_extent *entry,
 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd9..05d41e5 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
 struct backref_node {
 struct rb_node rb_node;
 u64 bytenr;
- /* objectid tree block owner */
+
+ u64 new_bytenr;
+ /* objectid of tree block owner, can be not uptodate */
 u64 owner;
+ /* link to pending, changed or detached list */
+ struct list_head list;
 /* list of upper level blocks reference this block */
 struct list_head upper;
 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
 struct extent_buffer *eb;
 /* level of tree block */
 unsigned int level:8;
- /* 1 if the block is root of old snapshot */
- unsigned int old_root:1;
- /* 1 if no child blocks in the cache */
+ /* is the block in non-reference counted tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node in the cache */
 unsigned int lowest:1;
 /* is the extent buffer locked */
 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
 unsigned int processed:1;
 /* have backrefs of this block been checked */
 unsigned int checked:1;
+ /*
+ *
1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. + */ + unsigned int detached:1; }; /* @@ -74,7 +88,6 @@ struct backref_node { struct backref_edge { struct list_head list[2]; struct backref_node *node[2]; - u64 blockptr; }; #define LOWER 0 @@ -83,9 +96,25 @@ struct backref_edge { struct backref_cache { /* red black tree of all backref nodes in the cache */ struct rb_root rb_root; - /* list of backref nodes with no child block in the cache */ + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ struct list_head pending[BTRFS_MAX_LEVEL]; - spinlock_t lock; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; }; /* @@ -113,15 +142,6 @@ struct tree_block { unsigned int key_ready:1; }; -/* inode vector */ -#define INODEVEC_SIZE 16 - -struct inodevec { - struct list_head list; - struct inode *inode[INODEVEC_SIZE]; - int nr; -}; - #define MAX_EXTENTS 128 struct file_extent_cluster { @@ -138,36 +158,43 @@ struct reloc_control { struct btrfs_root *extent_root; /* inode for moving data */ struct inode *data_inode; - struct btrfs_workers workers; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; /* tree blocks have been processed */ struct extent_io_tree processed_blocks; /* map start of tree root to corresponding reloc tree */ struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + u64 search_start; u64 extents_found; - u64 extents_skipped; - int stage; - int create_reloc_root; + + int block_rsv_retries; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int found_old_snapshot:1; + unsigned int commit_transaction:1; }; /* stages of data relocation */ #define MOVE_DATA_EXTENTS 0 #define UPDATE_DATA_PTRS 1 -/* - * merge reloc tree to corresponding fs tree in worker threads - */ -struct async_merge { - struct btrfs_work work; - struct reloc_control *rc; - struct btrfs_root *root; - struct completion *done; - atomic_t *num_pending; -}; +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); static void mapping_tree_init(struct mapping_tree *tree) { @@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache) cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = 
list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } +} + +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) +{ + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; } -static void backref_node_init(struct backref_node *node) +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) { - memset(node, 0, sizeof(*node)); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); + if (edge) { + cache->nr_edges--; + kfree(edge); + } } static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, @@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node, edges[idx++] = edge; node = edge->node[UPPER]; } + BUG_ON(node->detached); *index = idx; return node; } @@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[], return NULL; } +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + static void drop_node_buffer(struct backref_node *node) { if (node->eb) { - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } + unlock_node_buffer(node); free_extent_buffer(node->eb); node->eb = NULL; } @@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node) static void drop_backref_node(struct backref_cache *tree, struct backref_node *node) { - BUG_ON(!node->lowest); BUG_ON(!list_empty(&node->upper)); drop_node_buffer(node); + list_del(&node->list); list_del(&node->lower); - - rb_erase(&node->rb_node, &tree->rb_root); - kfree(node); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); } /* @@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache, if (!node) return; - BUG_ON(!node->lowest); + BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct backref_edge, list[LOWER]); upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); - kfree(edge); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } /* - * add the node to pending list if no other + * add the node to leaf node list if no other * child block cached. 
*/ if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, - &cache->pending[upper->level]); + list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } } + drop_backref_node(cache, node); } +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!root->ref_cows) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. + */ + return 1; +} + /* * find reloc tree by address of tree root */ @@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * for all upper level blocks that directly/indirectly reference the * block are also cached. 
*/ -static struct backref_node *build_backref_tree(struct reloc_control *rc, - struct backref_cache *cache, - struct btrfs_key *node_key, - int level, u64 bytenr) +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) { + struct backref_cache *cache = &rc->backref_cache; struct btrfs_path *path1; struct btrfs_path *path2; struct extent_buffer *eb; @@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, unsigned long end; unsigned long ptr; LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; int ret; int err = 0; @@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, goto out; } - node = kmalloc(sizeof(*node), GFP_NOFS); + node = alloc_backref_node(cache); if (!node) { err = -ENOMEM; goto out; } - backref_node_init(node); node->bytenr = bytenr; - node->owner = 0; node->level = level; node->lowest = 1; cur = node; @@ -587,17 +780,20 @@ again: #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.objectid == key.offset && - key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { struct btrfs_extent_ref_v0 *ref0; ref0 = btrfs_item_ptr(eb, path1->slots[0], struct btrfs_extent_ref_v0); root = find_tree_root(rc, eb, ref0); - if (root) - cur->root = root; - else - cur->old_root = 1; - break; + if (!root->ref_cows) + cur->cowonly = 1; + if (key.objectid == key.offset) { + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } } #else BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); @@ -614,22 +810,20 @@ again: break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; } rb_node = tree_search(&cache->rb_root, key.offset); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = key.offset; - upper->owner = 0; upper->level = cur->level + 1; /* * backrefs for the upper level block isn't @@ -639,11 +833,12 @@ again: } else { upper = rb_entry(rb_node, struct backref_node, rb_node); + BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - list_add(&edge->list[LOWER], &cur->upper); - edge->node[UPPER] = upper; + list_add_tail(&edge->list[LOWER], &cur->upper); edge->node[LOWER] = cur; + edge->node[UPPER] = upper; goto next; } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { @@ -657,11 +852,17 @@ again: goto out; } + if (!root->ref_cows) + cur->cowonly = 1; + if (btrfs_root_level(&root->root_item) == cur->level) { /* tree root */ BUG_ON(btrfs_root_bytenr(&root->root_item) != cur->bytenr); - cur->root = root; + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; break; } @@ -692,11 +893,14 @@ again: if (!path2->nodes[level]) { BUG_ON(btrfs_root_bytenr(&root->root_item) != lower->bytenr); - lower->root = root; + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; @@ -705,16 +909,17 @@ again: eb = path2->nodes[level]; rb_node = tree_search(&cache->rb_root, eb->start); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + 
upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; + if (!root->ref_cows) + upper->cowonly = 1; /* * if we know the block isn't shared @@ -744,10 +949,12 @@ again: rb_node); BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); } list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[UPPER] = upper; edge->node[LOWER] = lower; + edge->node[UPPER] = upper; if (rb_node) break; @@ -785,8 +992,13 @@ next: * into the cache. */ BUG_ON(!node->checked); - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + BUG_ON(rb_node); + list_add_tail(&node->lower, &cache->leaves); + } list_for_each_entry(edge, &node->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); @@ -795,6 +1007,14 @@ next: edge = list_entry(list.next, struct backref_edge, list[UPPER]); list_del_init(&edge->list[UPPER]); upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } if (!RB_EMPTY_NODE(&upper->rb_node)) { if (upper->lowest) { @@ -807,25 +1027,69 @@ next: } BUG_ON(!upper->checked); - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - BUG_ON(rb_node); + BUG_ON(cowonly != upper->cowonly); + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + } list_add_tail(&edge->list[UPPER], &upper->lower); list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. 
+ */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + BUG_ON(!list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } out: btrfs_free_path(path1); btrfs_free_path(path2); if (err) { - INIT_LIST_HEAD(&list); + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, upper); + list_del_init(&lower->upper); + } upper = node; + INIT_LIST_HEAD(&list); while (upper) { if (RB_EMPTY_NODE(&upper->rb_node)) { list_splice_tail(&upper->upper, &list); - kfree(upper); + free_backref_node(cache, upper); } if (list_empty(&list)) @@ -833,15 +1097,104 @@ out: edge = list_entry(list.next, struct backref_edge, list[LOWER]); + list_del(&edge->list[LOWER]); upper = edge->node[UPPER]; - kfree(edge); + free_backref_edge(cache, edge); } return ERR_PTR(err); } + BUG_ON(node && node->detached); return node; } /* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + BUG_ON(rb_node); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = 
list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) return 0; } -/* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. - */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) { struct btrfs_root *reloc_root; struct extent_buffer *eb; @@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_key root_key; int ret; - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!root->fs_info->reloc_ctl || - !root->fs_info->reloc_ctl->create_reloc_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); BUG_ON(!root_item); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = root->root_key.objectid; + root_key.offset = objectid; - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. + */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } - btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_refs(root_item, 1); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + } btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. 
+ */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + int clear_rsv = 0; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->block_rsv) { + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = NULL; __add_reloc_root(reloc_root); root->reloc_root = reloc_root; @@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; - if (btrfs_root_refs(root_item) == 0) { + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; del = 1; } @@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, goto out; } - if (new_bytenr) - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ret = 0; out: btrfs_free_path(path); @@ -1114,19 +1503,18 @@ out: * update file extent items in the tree leaf to point to * the new locations. */ -static int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct list_head *inode_list) +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) { struct btrfs_key key; struct btrfs_file_extent_item *fi; struct inode *inode = NULL; - struct inodevec *ivec = NULL; u64 parent; u64 bytenr; - u64 new_bytenr; + u64 new_bytenr = 0; u64 num_bytes; u64 end; u32 nritems; @@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (!ivec || ivec->nr == INODEVEC_SIZE) { - ivec = kmalloc(sizeof(*ivec), GFP_NOFS); - BUG_ON(!ivec); - ivec->nr = 0; - list_add_tail(&ivec->list, inode_list); - } if (first) { inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; first = 0; } else if (inode && inode->i_ino < key.objectid) { + btrfs_add_delayed_iput(inode); inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; } if (inode && inode->i_ino == key.objectid) { end = key.offset + @@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, ret = get_new_location(rc->data_inode, &new_bytenr, bytenr, num_bytes); - if (ret > 0) + if (ret > 0) { + WARN_ON(1); continue; + } BUG_ON(ret < 0); btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); @@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, } if (dirty) btrfs_mark_buffer_dirty(leaf); + if (inode) + btrfs_add_delayed_iput(inode); return 0; } @@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot, * if no block got replaced, 0 is returned. if there are other * errors, a negative error number is returned. 
*/ -static int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - struct extent_buffer **leaf, - int lowest_level, int max_level) +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) { struct extent_buffer *eb; struct extent_buffer *parent; @@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans, u64 new_ptr_gen; u64 last_snapshot; u32 blocksize; + int cow = 0; int level; int ret; int slot; BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(lowest_level > 1 && leaf); last_snapshot = btrfs_root_last_snapshot(&src->root_item); - +again: slot = path->slots[lowest_level]; btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); @@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans, return 0; } - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } btrfs_set_lock_blocking(eb); if (next_key) { @@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans, if (new_bytenr == 0 || old_ptr_gen > last_snapshot || memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level && !leaf) { + if (level <= lowest_level) { ret = 0; break; } @@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans, eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); btrfs_tree_lock(eb); - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (level <= lowest_level) { - *leaf = eb; - ret = 0; - break; + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); } + btrfs_set_lock_blocking(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans, continue; } + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); btrfs_release_path(src, path); @@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root, return 0; } -static void put_inodes(struct list_head *list) -{ - struct inodevec *ivec; - while (!list_empty(list)) { - ivec = list_entry(list->next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } -} - static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; - struct extent_buffer *leaf = NULL; + struct extent_buffer *leaf; unsigned long nr; int level; int max_level; int replaced = 0; int ret; int err = 0; + u32 min_reserved; path = btrfs_alloc_path(); if (!path) @@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { - trans = btrfs_start_transaction(root, 1); + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + 
memset(&next_key, 0, sizeof(next_key)); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(reloc_root, path); + while (1) { + trans = btrfs_start_transaction(root, 0); + trans->block_rsv = rc->block_rsv; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto out; + ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, + min_reserved, 0); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; } - leaf = path->nodes[0]; - btrfs_unlock_up_safe(path, 1); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - if (ret < 0) - err = ret; - goto out; - } - - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - leaf = NULL; replaced = 0; - trans = btrfs_start_transaction(root, 1); max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); @@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (!find_next_key(path, level, &key) && btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; - } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_path(trans, root, reloc_root, - path, &next_key, &leaf, - level, max_level); } else { - ret = replace_path(trans, root, reloc_root, - path, &next_key, NULL, - level, max_level); + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); } if (ret < 0) { err = ret; @@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); replaced = 1; - } else if (leaf) { - /* - * no block got replaced, try replacing file extents - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - BUG_ON(ret < 0); } ret = walk_up_reloc_tree(reloc_root, path, &level); @@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, root_item->drop_level = level; nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); - /* - * put inodes outside transaction, otherwise we may deadlock. - */ - put_inodes(&inode_list); - if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } @@ -1765,87 +2109,125 @@ out: sizeof(root_item->drop_progress)); root_item->drop_level = 0; btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); } nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); - put_inodes(&inode_list); - if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); return err; } -/* - * callback for the work threads. - * this function merges reloc tree with corresponding fs tree, - * and then drops the reloc tree. 
- */ -static void merge_func(struct btrfs_work *work) +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = rc->extent_root; struct btrfs_root *reloc_root; - struct async_merge *async; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + int retries = 0; + + mutex_lock(&root->fs_info->trans_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->trans_mutex); +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, + num_bytes, &retries); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root, 1); + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + retries = 0; + goto again; + } + } - async = container_of(work, struct async_merge, work); - reloc_root = async->root; + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); - if (btrfs_root_refs(&reloc_root->root_item) > 0) { root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); - merge_reloc_root(async->rc, root); - - trans = btrfs_start_transaction(root, 1); + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); btrfs_update_reloc_root(trans, root); - btrfs_end_transaction(trans, root); - } - btrfs_drop_snapshot(reloc_root, 0); + list_add(&reloc_root->root_list, &reloc_roots); + } - if (atomic_dec_and_test(async->num_pending)) - complete(async->done); + list_splice(&reloc_roots, &rc->reloc_roots); - kfree(async); + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; } -static int merge_reloc_roots(struct reloc_control *rc) +static noinline_for_stack +int merge_reloc_roots(struct reloc_control *rc) { - struct async_merge *async; struct btrfs_root *root; - struct completion done; - atomic_t num_pending; + struct btrfs_root *reloc_root; + LIST_HEAD(reloc_roots); + int found = 0; + int ret; +again: + root = rc->extent_root; + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->trans_mutex); - init_completion(&done); - atomic_set(&num_pending, 1); + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); - while (!list_empty(&rc->reloc_roots)) { - root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&root->root_list); + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); - async = kmalloc(sizeof(*async), GFP_NOFS); - BUG_ON(!async); - async->work.func = merge_func; - async->work.flags = 0; - async->rc = rc; - async->root = root; - async->done = &done; - async->num_pending = &num_pending; - atomic_inc(&num_pending); - btrfs_queue_worker(&rc->workers, &async->work); 
+ ret = merge_reloc_root(rc, root); + BUG_ON(ret); + } else { + list_del_init(&reloc_root->root_list); + } + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); } - if (!atomic_dec_and_test(&num_pending)) - wait_for_completion(&done); - + if (found) { + found = 0; + goto again; + } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); return 0; } @@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return btrfs_record_root_in_trans(trans, root); } -/* - * select one tree from trees that references the block. - * for blocks in refernce counted trees, we preper reloc tree. - * if no reloc tree found and reloc_only is true, NULL is returned. - */ -static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], - int *nr, int reloc_only) +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[], int *nr) { struct backref_node *next; struct btrfs_root *root; - int index; - int loop = 0; -again: - index = 0; + int index = 0; + next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - if (!root) { - BUG_ON(!node->old_root); - goto skip; - } - - /* no other choice for non-refernce counted tree */ - if (!root->ref_cows) { - BUG_ON(reloc_only); - break; - } + BUG_ON(!root); + BUG_ON(!root->ref_cows); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); break; } - if (loop) { - btrfs_record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); break; } - if (reloc_only || next != node) { - if (!root->reloc_root) - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - /* - * if the reloc tree was created in current - * transation, there is no node in backref tree - * corresponds to the root of the reloc tree. - */ - if (btrfs_root_last_snapshot(&root->root_item) == - trans->transid - 1) - break; - } -skip: + WARN_ON(1); root = NULL; next = walk_down_backref(edges, &index); if (!next || next->level <= node->level) break; } + if (!root) + return NULL; - if (!root && !loop && !reloc_only) { - loop = 1; - goto again; + *nr = index; + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; } - - if (root) - *nr = index; - else - *nr = 0; - return root; } +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. 
+ */ static noinline_for_stack struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, struct backref_node *node) { + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int nr; - return __select_one_root(trans, node, edges, &nr, 0); + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-refernce counted tree */ + if (!root->ref_cows) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; } static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], int *nr) +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) { - return __select_one_root(trans, node, edges, nr, 1); + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += btrfs_level_size(rc->extent_root, + next->level); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; } -static void grab_path_buffers(struct btrfs_path *path, - struct backref_node *node, - struct backref_edge *edges[], int nr) +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) { - int i = 0; - while (1) { - drop_node_buffer(node); - node->eb = path->nodes[node->level]; - BUG_ON(!node->eb); - if (path->locks[node->level]) - node->locked = 1; - path->nodes[node->level] = NULL; - path->locks[node->level] = 0; - - if (i >= nr) - break; + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; - edges[i]->blockptr = node->eb->start; - node = edges[i]->node[UPPER]; - i++; + trans->block_rsv = rc->block_rsv; + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, + &rc->block_rsv_retries); + if (ret) { + if (ret == -EAGAIN) + rc->commit_transaction = 1; + return ret; } + + rc->block_rsv_retries = 0; + return 0; +} + +static void release_metadata_space(struct reloc_control *rc, + struct backref_node *node) +{ + u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); } /* @@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path, * in that case this function just updates pointers. 
*/ static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_key *key, struct btrfs_path *path, int lowest) @@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans, BUG_ON(lowest && node->eb); path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { cond_resched(); - if (node->eb && node->eb->start == edge->blockptr) - continue; upper = edge->node[UPPER]; - root = select_reloc_root(trans, upper, edges, &nr); - if (!root) - continue; - - if (upper->eb && !upper->locked) + root = select_reloc_root(trans, rc, upper, edges, &nr); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } drop_node_buffer(upper); + } if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); @@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans, } BUG_ON(ret > 0); - slot = path->slots[upper->level]; + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } - btrfs_unlock_up_safe(path, upper->level + 1); - grab_path_buffers(path, upper, edges, nr); + upper->locked = 1; + path->locks[upper->level] = 0; + slot = path->slots[upper->level]; btrfs_release_path(NULL, path); } else { ret = btrfs_bin_search(upper->eb, key, upper->level, @@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(upper->eb, slot); - if (!lowest) { - if (node->eb->start == bytenr) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - continue; - } + if (lowest) { + BUG_ON(bytenr != node->bytenr); } else { - BUG_ON(node->bytenr != bytenr); + if (node->eb->start == bytenr) + goto next; } blocksize = btrfs_level_size(root, node->level); @@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); if (ret < 0) { err = ret; - break; + goto next; } - btrfs_set_lock_blocking(eb); - node->eb = eb; - node->locked = 1; + BUG_ON(node->eb != eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans, ret = btrfs_drop_subtree(trans, root, eb, upper->eb); BUG_ON(ret); } - if (!lowest) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); return err; } static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_path *path) { struct btrfs_key key; - if (!node->eb || list_empty(&node->upper)) - return 0; btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, node, &key, path, 0); + return do_relocation(trans, rc, node, &key, path, 0); } static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct backref_cache *cache, - struct btrfs_path *path) 
+ struct reloc_control *rc, + struct btrfs_path *path, int err) { + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; struct backref_node *node; int level; int ret; - int err = 0; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { node = list_entry(cache->pending[level].next, - struct backref_node, lower); - BUG_ON(node->level != level); + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); - ret = link_to_upper(trans, node, path); - if (ret < 0) - err = ret; - /* - * this remove the node from the pending list and - * may add some other nodes to the level + 1 - * pending list - */ - remove_backref_node(cache, node); + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } } + list_splice_init(&list, &cache->pending[level]); } - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); return err; } static void mark_block_processed(struct reloc_control *rc, - struct backref_node *node) + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) { u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { blocksize = btrfs_level_size(rc->extent_root, node->level); - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - GFP_NOFS); + mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; } @@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc, if (next->processed) break; - mark_block_processed(rc, next); + __mark_block_processed(rc, next); if (list_empty(&next->upper)) break; @@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, return 0; } -/* - * check if there are any file extent pointers in the leaf point to - * data require processing - */ -static int check_file_extents(struct reloc_control *rc, - u64 bytenr, u32 blocksize, u64 ptr_gen) -{ - struct btrfs_key found_key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - u32 nritems; - int i; - int ret = 0; - - leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &found_key, i); - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) - continue; - if (in_block_group(bytenr, rc->block_group)) { - ret = 1; - break; - } - } - free_extent_buffer(leaf); - return ret; -} - -/* - * scan child blocks of a given block to find blocks require processing - */ -static int add_child_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; - u32 nritems; - int i; - int err = 0; - - nritems = btrfs_header_nritems(node->eb); - blocksize = btrfs_level_size(rc->extent_root, node->level - 1); - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - 
if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - - readahead_tree_block(rc->extent_root, - bytenr, blocksize, ptr_gen); - } - - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - if (!in_block_group(bytenr, rc->block_group) && - !check_file_extents(rc, bytenr, blocksize, ptr_gen)) - continue; - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = bytenr; - btrfs_node_key_to_cpu(node->eb, &block->key, i); - block->level = node->level - 1; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - } - if (err) - free_block_list(blocks); - return err; -} - -/* - * find adjacent blocks require processing - */ -static noinline_for_stack -int add_adjacent_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_cache *cache, - struct rb_root *blocks, int level, - struct backref_node **upper) -{ - struct backref_node *node; - int ret = 0; - - WARN_ON(!list_empty(&cache->pending[level])); - - if (list_empty(&cache->pending[level + 1])) - return 1; - - node = list_entry(cache->pending[level + 1].next, - struct backref_node, lower); - if (node->eb) - ret = add_child_blocks(trans, rc, node, blocks); - - *upper = node; - return ret; -} - static int get_tree_block_key(struct reloc_control *rc, struct tree_block *block) { @@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int ret; + int release = 0; + int ret = 0; + if (!node) + return 0; + + BUG_ON(node->processed); root = select_one_root(trans, node); - if (unlikely(!root)) { - rc->found_old_snapshot = 1; + if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); - return 0; + goto out; } - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = do_relocation(trans, node, key, path, 1); - if (ret < 0) - goto out; - if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, - node->eb, NULL); - if (ret < 0) - goto out; - } - drop_node_buffer(node); - } else if (!root->ref_cows) { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(root, path); - if (ret < 0) + if (!root || root->ref_cows) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) goto out; - } else if (root != node->root) { - WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + release = 1; } - update_processed_blocks(rc, node); - ret = 0; + if (root) { + if (root->ref_cows) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret > 0) + ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, 
node, key, path, 1); + } out: - drop_node_buffer(node); + if (ret || node->level == 0 || node->cowonly) { + if (release) + release_metadata_space(rc, node); + remove_backref_node(&rc->backref_cache, node); + } return ret; } @@ -2415,12 +2752,10 @@ static noinline_for_stack int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { - struct backref_cache *cache; struct backref_node *node; struct btrfs_path *path; struct tree_block *block; struct rb_node *rb_node; - int level = -1; int ret; int err = 0; @@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - cache = kmalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - btrfs_free_path(path); - return -ENOMEM; - } - - backref_cache_init(cache); - rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (level == -1) - level = block->level; - else - BUG_ON(level != block->level); if (!block->key_ready) reada_tree_block(rc, block); rb_node = rb_next(rb_node); @@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - node = build_backref_tree(rc, cache, &block->key, + node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - err = ret; + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; goto out; } - remove_backref_node(cache, node); rb_node = rb_next(rb_node); } - - if (level > 0) - goto out; - +out: free_block_list(blocks); + err = finish_pending_nodes(trans, rc, path, err); - /* - * now backrefs of some upper level tree blocks have been cached, - * try relocating blocks referenced by these upper level blocks. 
- */ - while (1) { - struct backref_node *upper = NULL; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - break; + btrfs_free_path(path); + return err; +} - ret = add_adjacent_blocks(trans, rc, cache, blocks, level, - &upper); - if (ret < 0) - err = ret; - if (ret != 0) - break; +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - goto out; - BUG_ON(!block->key_ready); - node = build_backref_tree(rc, cache, &block->key, - level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); - ret = relocate_tree_block(trans, rc, node, - &block->key, path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - free_block_list(blocks); + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start); + if (ret) + goto out; - if (upper) { - ret = link_to_upper(trans, upper, path); - if (ret < 0) { - err = ret; - break; - } - remove_backref_node(cache, upper); - } + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + if (ret) + break; + nr++; } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); out: - free_block_list(blocks); - - ret = finish_pending_nodes(trans, cache, path); - if (ret < 0) - err = ret; - - kfree(cache); - btrfs_free_path(path); - return err; + mutex_unlock(&inode->i_mutex); + return ret; } static noinline_for_stack @@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode, u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; - unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; int nr = 0; @@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; - mutex_lock(&inode->i_mutex); + file_ra_state_init(ra, inode->i_mapping); - i_size_write(inode, cluster->end + 1 - offset); ret = setup_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); + goto out; - WARN_ON(cluster->start != cluster->boundary[0]); + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + page = find_lock_page(inode->i_mapping, index); if (!page) { page_cache_sync_readahead(inode->i_mapping, @@ -2623,8 
+2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode, last_index + 1 - index); page = grab_cache_page(inode->i_mapping, index); if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -ENOMEM; - goto out_unlock; + goto out; } } @@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -EIO; - goto out_unlock; + goto out; } } @@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode, EXTENT_BOUNDARY, GFP_NOFS); nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); set_page_dirty(page); - dirty_page++; unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); @@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode, page_cache_release(page); index++; - if (nr < cluster->nr && - page_end + 1 + offset == cluster->boundary[nr]) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); - dirty_page = 0; - } - } - if (dirty_page) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); } WARN_ON(nr != cluster->nr); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: kfree(ra); return ret; } @@ -2870,9 +3172,6 @@ out: static int block_use_full_backref(struct reloc_control *rc, struct extent_buffer *eb) { - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct btrfs_key key; u64 flags; int ret; @@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc, btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) return 1; - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = eb->start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = eb->len; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, eb->len, NULL, &flags); BUG_ON(ret); - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - flags = btrfs_extent_flags(path->nodes[0], ei); - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) ret = 1; else ret = 0; - btrfs_free_path(path); return ret; } @@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize; + u32 blocksize = btrfs_level_size(rc->extent_root, 0); int ret; int err = 0; - ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, - extent_key->offset); - BUG_ON(ret < 0); - if (ret > 0) { - /* the relocated data is fragmented */ - rc->extents_skipped++; - btrfs_release_path(rc->extent_root, path); - return 0; - } - - blocksize = btrfs_level_size(rc->extent_root, 0); - eb = path->nodes[0]; ptr = btrfs_item_ptr_offset(eb, path->slots[0]); end = ptr + btrfs_item_size_nr(eb, path->slots[0]); @@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc, */ static noinline_for_stack int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path) + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) { struct btrfs_key key; struct extent_buffer *leaf; @@ -3225,6 +3499,7 @@ 
next: rc->search_start = end + 1; } else { rc->search_start = key.objectid + key.offset; + memcpy(extent_key, &key, sizeof(key)); return 0; } } @@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags) return 0; } +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + int ret; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + if (!rc->block_rsv) + return -ENOMEM; + + /* + * reserve some space for creating reloc trees. + * btrfs_init_reloc_root will use them when there + * is no reservation in transaction handle. + */ + ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + rc->extent_root->nodesize * 256, + &rc->block_rsv_retries); + if (ret) + return ret; + + rc->block_rsv->refill_used = 1; + btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + rc->block_rsv_retries = 0; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; - struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) int ret; int err = 0; - cluster = kzalloc(sizeof(*cluster), GFP_NOFS); - if (!cluster) - return -ENOMEM; - path = btrfs_alloc_path(); - if (!path) { - kfree(cluster); + if (!path) return -ENOMEM; - } - - rc->extents_found = 0; - rc->extents_skipped = 0; - - rc->search_start = rc->block_group->key.objectid; - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - rc->create_reloc_root = 1; - set_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } while (1) { - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_start_transaction(rc->extent_root, 0); + + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } - ret = find_next_extent(trans, rc, path); + ret = find_next_extent(trans, rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (item_size >= sizeof(*ei)) { flags = btrfs_extent_flags(path->nodes[0], ei); ret = check_extent_flags(flags); @@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { + (flags & BTRFS_EXTENT_FLAG_DATA)) { ret = add_data_references(rc, &key, path, &blocks); } else { btrfs_release_path(rc->extent_root, path); ret = 0; } if 
(ret < 0) { - err = 0; + err = ret; break; } if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + ret = btrfs_block_rsv_check(trans, rc->extent_root, + rc->block_rsv, 0, 5); + if (ret < 0) { + if (ret != -EAGAIN) { err = ret; + WARN_ON(1); break; } + rc->commit_transaction = 1; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + if (rc->commit_transaction) { + rc->commit_transaction = 0; + ret = btrfs_commit_transaction(trans, rc->extent_root); + BUG_ON(ret); + } else { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } trans = NULL; - btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; ret = relocate_data_extent(rc->data_inode, - &key, cluster); + &key, &rc->cluster); if (ret < 0) { err = ret; break; } } } - btrfs_free_path(path); + + btrfs_release_path(rc->extent_root, path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); if (trans) { nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + btrfs_end_transaction_throttle(trans, rc->extent_root); btrfs_btree_balance_dirty(rc->extent_root, nr); } if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, cluster); + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); if (ret < 0) err = ret; } - kfree(cluster); + rc->create_reloc_tree = 0; + set_reloc_control(rc); - rc->create_reloc_root = 0; - smp_mb(); + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); - if (rc->extents_found > 0) { - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - } + err = prepare_to_merge(rc, err); merge_reloc_roots(rc); + rc->merge_reloc_tree = 0; unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); - +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); return err; } @@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root, path); out: @@ -3460,8 +3790,9 @@ out: * helper to create inode for data relocation. 
* the inode is in data relocation tree and its link count is 0 */ -static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (IS_ERR(root)) return ERR_CAST(root); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); err = btrfs_find_free_objectid(trans, root, objectid, &objectid); if (err) @@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, out: nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); if (err) { if (inode) @@ -3506,6 +3837,21 @@ out: return inode; } +static struct reloc_control *alloc_reloc_control(void) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + return rc; +} + /* * function to relocate all extents in a block group. */ @@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) struct btrfs_fs_info *fs_info = extent_root->fs_info; struct reloc_control *rc; int ret; + int rw = 0; int err = 0; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(); if (!rc) return -ENOMEM; - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); - INIT_LIST_HEAD(&rc->reloc_roots); + rc->extent_root = extent_root; rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size, NULL); - - rc->extent_root = extent_root; - btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { @@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); while (1) { - rc->extents_found = 0; - rc->extents_skipped = 0; - mutex_lock(&fs_info->cleaner_mutex); btrfs_clean_old_snapshots(fs_info->tree_root); @@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; - break; + goto out; } if (rc->extents_found == 0) @@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; - } else if (rc->stage == UPDATE_DATA_PTRS && - rc->extents_skipped >= rc->extents_found) { - iput(rc->data_inode); - rc->data_inode = create_reloc_inode(fs_info, - rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - break; - } - rc->stage = MOVE_DATA_EXTENTS; - rc->found_file_extent = 0; } } @@ -3597,8 +3930,9 @@ int 
btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); iput(rc->data_inode); - btrfs_stop_workers(&rc->workers); btrfs_put_block_group(rc->block_group); kfree(rc); return err; @@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(); if (!rc) { err = -ENOMEM; goto out; } - mapping_tree_init(&rc->reloc_root_tree); - INIT_LIST_HEAD(&rc->reloc_roots); - btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); + trans = btrfs_join_transaction(rc->extent_root, 1); + + rc->merge_reloc_tree = 1; + while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root->reloc_root = reloc_root; } - trans = btrfs_start_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); merge_reloc_roots(rc); unset_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); out: - if (rc) { - btrfs_stop_workers(&rc->workers); - kfree(rc); - } + kfree(rc); while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_put_ordered_extent(ordered); return 0; } + +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, cow); + BUG_ON(ret); + } +} + +/* + * called before creating snapshot. 
it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + BUG_ON(ret); + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + + __add_reloc_root(reloc_root); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) { + ret = clone_backref_node(trans, rc, root, reloc_root); + BUG_ON(ret); + } +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 67fa2d2..b91ccd9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; int err = 0; int ret; @@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = 0; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + while (1) { ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) { @@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_find_dead_roots(tree_root, key.offset); - if (ret) { + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root_no_name(tree_root->fs_info, + &root_key); + if (!IS_ERR(root)) + continue; + + ret = PTR_ERR(root); + if (ret != -ENOENT) { err = ret; break; } - key.offset++; + ret = btrfs_find_dead_roots(tree_root, root_key.objectid); + if (ret) { + err = ret; + break; + } } btrfs_free_path(path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1866dff..d34b2df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root, 0); btrfs_wait_ordered_extents(root, 0, 0); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_commit_transaction(trans, 
root); return ret; } @@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - /* recover relocation */ - ret = btrfs_recover_relocation(root); + ret = btrfs_cleanup_fs_roots(root->fs_info); WARN_ON(ret); - ret = btrfs_cleanup_fs_roots(root->fs_info); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); sb->s_flags &= ~MS_RDONLY; @@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 data_used = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_DUP| - BTRFS_BLOCK_GROUP_RAID10| - BTRFS_BLOCK_GROUP_RAID1)) { - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } - - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } + list_for_each_entry_rcu(found, head, list) + total_used += found->disk_used; rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (data_used >> bits); + buf->f_bavail = buf->f_bfree; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; @@ -832,11 +816,14 @@ static const struct file_operations btrfs_ctl_fops = { }; static struct miscdevice btrfs_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + static int btrfs_interface_init(void) { return misc_register(&btrfs_misc); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2cb1160..66e4c66 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -165,54 +165,89 @@ enum btrfs_trans_type { TRANS_USERSPACE, }; +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (!root->fs_info->log_root_recovering && + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) + return 1; + return 0; +} + static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int type) + u64 num_items, int type) { - struct btrfs_trans_handle *h = - kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + int retries = 0; int ret; +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) + return ERR_PTR(-ENOMEM); mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + if (may_wait_transaction(root, type)) wait_current_trans(root); + ret = join_transaction(root); BUG_ON(ret); - h->transid = root->fs_info->running_transaction->transid; - h->transaction = root->fs_info->running_transaction; - h->blocks_reserved = num_blocks; + cur_trans = root->fs_info->running_transaction; + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + h->transid = cur_trans->transid; + h->transaction = 
cur_trans; h->blocks_used = 0; h->block_group = 0; - h->alloc_exclude_nr = 0; - h->alloc_exclude_start = 0; + h->bytes_reserved = 0; h->delayed_ref_updates = 0; + h->block_rsv = NULL; - if (!current->journal_info && type != TRANS_USERSPACE) - current->journal_info = h; + smp_mb(); + if (cur_trans->blocked && may_wait_transaction(root, type)) { + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_items > 0) { + ret = btrfs_trans_reserve_metadata(h, root, num_items, + &retries); + if (ret == -EAGAIN) { + btrfs_commit_transaction(h, root); + goto again; + } + if (ret < 0) { + btrfs_end_transaction(h, root); + return ERR_PTR(ret); + } + } - root->fs_info->running_transaction->use_count++; + mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; return h; } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks) + int num_items) { - return start_transaction(root, num_blocks, TRANS_START); + return start_transaction(root, num_items, TRANS_START); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, TRANS_JOIN); + return start_transaction(root, 0, TRANS_JOIN); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { - return start_transaction(r, num_blocks, TRANS_USERSPACE); + return start_transaction(r, 0, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ @@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root) mutex_unlock(&root->fs_info->trans_mutex); } +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + ret = btrfs_block_rsv_check(trans, root, + &root->fs_info->global_block_rsv, 0, 5); + return ret ? 
1 : 0; +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + + if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) + btrfs_run_delayed_refs(trans, root, updates); + + return should_end_transaction(trans, root); +} + static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int throttle) { - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; @@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } + btrfs_trans_release_metadata(trans, root); + + if (!root->fs_info->open_ioctl_trans && + should_end_transaction(trans, root)) + trans->transaction->blocked = 1; + + if (cur_trans->blocked && !cur_trans->in_commit) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + mutex_lock(&info->trans_mutex); - cur_trans = info->running_transaction; - WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans != info->running_transaction); WARN_ON(cur_trans->num_writers < 1); cur_trans->num_writers--; @@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); if (root->commit_root != root->node) { switch_commit_root(root); @@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) { struct btrfs_fs_info *info = root->fs_info; - int ret; struct btrfs_trans_handle *trans; + int ret; unsigned long nr; - smp_mb(); - if (root->defrag_running) + if (xchg(&root->defrag_running, 1)) return 0; - trans = btrfs_start_transaction(root, 1); + while (1) { - root->defrag_running = 1; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root, nr); cond_resched(); - trans = btrfs_start_transaction(root, 1); if (root->fs_info->closing || ret != -EAGAIN) break; } root->defrag_running = 0; - smp_mb(); - btrfs_end_transaction(trans, root); - return 0; + return ret; } #if 0 @@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; int ret; - u64 objectid; - int namelen; + int retries = 0; + u64 to_reserve = 0; u64 index = 0; - - parent_inode = pending->dentry->d_parent->d_inode; - parent_root = BTRFS_I(parent_inode)->root; + u64 objectid; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = -ENOMEM; + pending->error = -ENOMEM; goto fail; } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); - if (ret) + if (ret) { + pending->error = ret; goto fail; + } + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + 
to_reserve, &retries); + if (ret) { + pending->error = ret; + goto fail; + } + } key.objectid = objectid; - /* record when the snapshot was created in key.offset */ - key.offset = trans->transid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; - memcpy(&pending->root_key, &key, sizeof(key)); - pending->root_key.offset = (u64)-1; + trans->block_rsv = &pending->block_rsv; + dentry = pending->dentry; + parent_inode = dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); + /* * insert the directory item */ - namelen = strlen(pending->name); ret = btrfs_set_inode_index(parent_inode, &index); BUG_ON(ret); ret = btrfs_insert_dir_item(trans, parent_root, - pending->name, namelen, - parent_inode->i_ino, - &pending->root_key, BTRFS_FT_DIR, index); + dentry->d_name.name, dentry->d_name.len, + parent_inode->i_ino, &key, + BTRFS_FT_DIR, index); BUG_ON(ret); - btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); @@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, free_extent_buffer(old); btrfs_set_root_node(new_root_item, tmp); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - new_root_item); - BUG_ON(ret); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); + BUG_ON(ret); - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, - pending->root_key.objectid, + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, parent_root->root_key.objectid, - parent_inode->i_ino, index, pending->name, - namelen); + parent_inode->i_ino, index, + dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(pending->snap)); + + btrfs_reloc_post_snapshot(trans, pending); + btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); - return ret; + btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); + return 0; } /* @@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info) return ret; } +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->blocked; + spin_unlock(&info->new_trans_lock); + return ret; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); + btrfs_trans_release_metadata(trans, root); + cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to @@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; else if (should_grow) @@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_run_ordered_operations(root, 
1); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + smp_mb(); if (cur_trans->num_writers > 1 || should_grow) schedule_timeout(timeout); @@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, 0); + btrfs_drop_snapshot(root, NULL, 0); else - btrfs_drop_snapshot(root, 1); + btrfs_drop_snapshot(root, NULL, 1); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 93c7ccb..e104986 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -45,20 +45,23 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; + u64 block_group; + u64 bytes_reserved; unsigned long blocks_reserved; unsigned long blocks_used; - struct btrfs_transaction *transaction; - u64 block_group; - u64 alloc_exclude_start; - u64 alloc_exclude_nr; unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; }; struct btrfs_pending_snapshot { struct dentry *dentry; struct btrfs_root *root; - char *name; - struct btrfs_key root_key; + struct btrfs_root *snap; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + /* extra metadata reseration for relocation */ + int error; struct list_head list; }; @@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks); + int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); + int num_blocks); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); + int num_blocks); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, @@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b10eacd..f7ac8e0 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->nodes[1], 0, cache_only, &last_ret, &root->defrag_progress); - WARN_ON(ret && ret != -EAGAIN); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; } - - btrfs_release_path(root, path); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index af57dd2..fb102a9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -135,6 +135,7 @@ static int 
start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; + int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); - BUG_ON(ret); + if (ret) + err = ret; } - if (!root->log_root) { + if (err == 0 && !root->log_root) { ret = btrfs_add_log_tree(trans, root); - BUG_ON(ret); + if (ret) + err = ret; } mutex_unlock(&root->fs_info->tree_log_mutex); root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return 0; + return err; } /* @@ -376,7 +379,7 @@ insert: BUG_ON(ret); } } else if (ret) { - BUG(); + return ret; } dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); @@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); - wc->process_func(root, next, wc, ptr_gen); - if (*level == 1) { + wc->process_func(root, next, wc, ptr_gen); + path->slots[*level]++; if (wc->free) { btrfs_read_buffer(next, ptr_gen); @@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - bytenr = path->nodes[*level]->start; - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); - - if (wc->free) { - next = path->nodes[*level]; - btrfs_tree_lock(next); - clean_tree_block(trans, root, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); - } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level += 1; + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); return 0; @@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { struct extent_buffer *node; node = path->nodes[i]; path->slots[i]++; @@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); - BUG_ON(ret); mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { @@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wake_up(&log_root_tree->log_writer_wait); } + if (ret) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); @@ -2129,15 +2112,10 @@ out: return 0; } -/* - * free all the extents used by the tree log. 
This should be called - * at commit time of the full transaction - */ -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) { int ret; - struct btrfs_root *log; - struct key; u64 start; u64 end; struct walk_control wc = { @@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) .process_func = process_one_buffer }; - if (!root->log_root || root->fs_info->log_root_recovering) - return 0; - - log = root->log_root; ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } - if (log->log_transid > 0) { - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); - } - root->log_root = NULL; free_extent_buffer(log->node); kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } return 0; } @@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di; struct btrfs_path *path; int ret; + int err = 0; int bytes_del = 0; if (BTRFS_I(dir)->logged_trans < trans->transid) @@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; BUG_ON(ret); @@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, index, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; BUG_ON(ret); @@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } if (ret == 0) { struct btrfs_inode_item *item; u64 i_size; @@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = 0; btrfs_release_path(log, path); } - +fail: btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } btrfs_end_log_trans(root); return 0; @@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } btrfs_end_log_trans(root); return ret; @@ -2318,7 +2329,8 @@ static noinline int 
insert_dir_log_key(struct btrfs_trans_handle *trans, else key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - BUG_ON(ret); + if (ret) + return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); @@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; + int err = 0; int ret; int i; int nritems; @@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); + if (ret) { + err = ret; + goto done; + } } } btrfs_release_path(root, path); @@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; ret = overwrite_item(trans, log, dst_path, src, i, &min_key); - BUG_ON(ret); + if (ret) { + err = ret; + goto done; + } } path->slots[0] = nritems; @@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); - - BUG_ON(ret); - last_offset = tmp.offset; + if (ret) + err = ret; + else + last_offset = tmp.offset; goto done; } } done: - *last_offset_ret = last_offset; btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - /* insert the log range keys to indicate where the log is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, - first_offset, last_offset); - BUG_ON(ret); - return 0; + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + inode->i_ino, first_offset, + last_offset); + if (ret) + err = ret; + } + return err; } /* @@ -2501,7 +2529,8 @@ again: ret = log_dir_items(trans, root, inode, path, dst_path, key_type, min_key, &max_key); - BUG_ON(ret); + if (ret) + return ret; if (max_key == (u64)-1) break; min_key = max_key + 1; @@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - - if (ret != 1) + BUG_ON(ret == 0); + if (ret < 0) break; if (path->slots[0] == 0) @@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); } btrfs_release_path(log, path); - return 0; + return ret; } static noinline int copy_items(struct btrfs_trans_handle *trans, @@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, } ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, nr); - BUG_ON(ret); + if (ret) { + kfree(ins_data); + return ret; + } for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], @@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, log, sums); - BUG_ON(ret); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); list_del(&sums->list); kfree(sums); } - return 0; + return ret; } /* log a single inode in the tree log. 
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; u32 size; + int err = 0; int ret; int nritems; int ins_start_slot = 0; @@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } path->keep_locks = 1; while (1) { @@ -2768,7 +2805,10 @@ again: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -2784,7 +2824,10 @@ next_slot: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 0; } btrfs_release_path(root, path); @@ -2802,7 +2845,10 @@ next_slot: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 0; } WARN_ON(ins_nr); @@ -2810,14 +2856,18 @@ next_slot: btrfs_release_path(root, path); btrfs_release_path(log, dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } } BTRFS_I(inode)->logged_trans = trans->transid; +out_unlock: mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); btrfs_free_path(dst_path); - return 0; + return err; } /* @@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - start_log_trans(trans, root); + ret = start_log_trans(trans, root); + if (ret) + goto end_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; /* * for regular files, if its inode is already on disk, we don't @@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if (S_ISREG(inode->i_mode) && BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto no_parent; + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } inode_only = LOG_INODE_EXISTS; while (1) { @@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; } if (IS_ROOT(parent)) break; parent = parent->d_parent; } -no_parent: ret = 0; +end_trans: + if (ret < 0) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 1; + } btrfs_end_log_trans(root); end_no_trans: return ret; @@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) path = btrfs_alloc_path(); BUG_ON(!path); - trans = btrfs_start_transaction(fs_info->tree_root, 1); + trans = btrfs_start_transaction(fs_info->tree_root, 0); wc.trans = trans; wc.pin = 1; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 0776eac..3dfae84 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -25,6 +25,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root 
*tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8db7b14..d6e3af8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); lock_chunks(root); device->barriers = 1; @@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* step one, relocate all the extents inside this chunk */ ret = btrfs_relocate_block_group(extent_root, chunk_offset); - BUG_ON(ret); + if (ret) + return ret; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); BUG_ON(!trans); lock_chunks(root); @@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root) break; BUG_ON(ret); - trans = btrfs_start_transaction(dev_root, 1); + trans = btrfs_start_transaction(dev_root, 0); BUG_ON(!trans); ret = btrfs_grow_device(trans, device, old_size); @@ -2094,11 +2095,7 @@ again: } /* Shrinking succeeded, else we would be at "done". */ - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto done; - } + trans = btrfs_start_transaction(root, 0); lock_chunks(root); device->disk_total_bytes = new_size; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 59acd3e..88ecbb2 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (trans) return do_setxattr(trans, inode, name, value, size, flags); - ret = btrfs_reserve_metadata_space(root, 2); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } btrfs_set_trans_block_group(trans, inode); ret = do_setxattr(trans, inode, name, value, size, flags); @@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, BUG_ON(ret); out: btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 2); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index e8aa708..d54812b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. 
*/ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. 
Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ out_release: page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. 
+ */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a9005d8..d9c60b8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; int rc = 0; struct page **pages; - struct pagevec pvec; loff_t offset; u64 len; @@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc < 0) goto out; - /* set uptodate and add to lru in pagevec-sized chunks */ - pagevec_init(&pvec, 0); for (; !list_empty(page_list) && len > 0; rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { struct page *page = @@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, zero_user_segment(page, s, PAGE_CACHE_SIZE); } - if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { page_cache_release(page); dout("readpages %p add_to_page_cache failed %p\n", inode, page); @@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); - if (pagevec_add(&pvec, page) == 0) - pagevec_lru_add_file(&pvec); /* add to lru */ + page_cache_release(page); } - pagevec_lru_add_file(&pvec); rc = 0; out: @@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_release_pages(req->r_pages, req->r_num_pages); if (req->r_pages_from_pool) mempool_free(req->r_pages, - ceph_client(inode->i_sb)->wb_pagevec_pool); + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else kfree(req->r_pages); ceph_osdc_put_request(req); diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 818afe7..89490bea 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -1,7 +1,6 @@ #include "ceph_debug.h" #include <linux/module.h> -#include <linux/slab.h> #include <linux/err.h> #include <linux/slab.h> @@ -150,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac, ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building request\n", ret); + pr_err("error %d building auth method %s request\n", ret, + ac->ops->name); return ret; } dout(" built request %d bytes\n", ret); @@ -229,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ret == -EAGAIN) { return ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { - pr_err("authentication error %d\n", ret); + pr_err("auth method '%s' error %d\n", ac->ops->name, ret); return ret; } return 0; @@ -246,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, if (!ac->protocol) return ceph_auth_build_hello(ac, msg_buf, msg_len); BUG_ON(!ac->ops); - if (!ac->ops->is_authenticated(ac)) + if (ac->ops->should_authenticate(ac)) return ceph_build_auth_request(ac, msg_buf, msg_len); return 0; } diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h index ca4f57c..d38a2fb 100644 --- a/fs/ceph/auth.h +++ b/fs/ceph/auth.h @@ -15,6 +15,8 @@ struct ceph_auth_client; struct ceph_authorizer; struct ceph_auth_client_ops { + const char *name; + /* * true if we are authenticated and can connect to * services. @@ -22,6 +24,12 @@ struct ceph_auth_client_ops { int (*is_authenticated)(struct ceph_auth_client *ac); /* + * true if we should (re)authenticate, e.g., when our tickets + * are getting old and crusty. 
+ */ + int (*should_authenticate)(struct ceph_auth_client *ac); + + /* * build requests and process replies during monitor * handshake. if handle_reply returns -EAGAIN, we build * another request. diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c index 8cd9e3a..ad1dc21 100644 --- a/fs/ceph/auth_none.c +++ b/fs/ceph/auth_none.c @@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac) return !xi->starting; } +static int should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return xi->starting; +} + /* * the generic auth code decode the global_id, and we carry no actual * authenticate state, so nothing happens here. @@ -94,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, } static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, + .should_authenticate = should_authenticate, .handle_reply = handle_reply, .create_authorizer = ceph_auth_none_create_authorizer, .destroy_authorizer = ceph_auth_none_destroy_authorizer, diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index fee5a08d..83d4d27 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac) return (ac->want_keys & xi->have_keys) == ac->want_keys; } +static int ceph_x_should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return need != 0; +} + static int ceph_x_encrypt_buflen(int ilen) { return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + @@ -127,7 +138,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, int ret; char *dbuf; char *ticket_buf; - u8 struct_v; + u8 reply_struct_v; dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) @@ -139,14 +150,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, goto out_dbuf; ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + reply_struct_v = ceph_decode_8(&p); + if (reply_struct_v != 1) goto bad; num = ceph_decode_32(&p); dout("%d tickets\n", num); while (num--) { int type; - u8 struct_v; + u8 tkt_struct_v, blob_struct_v; struct ceph_x_ticket_handler *th; void *dp, *dend; int dlen; @@ -165,8 +176,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, type = ceph_decode_32(&p); dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&p); + if (tkt_struct_v != 1) goto bad; th = get_ticket_handler(ac, type); @@ -186,8 +197,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, dend = dbuf + dlen; dp = dbuf; - struct_v = ceph_decode_8(&dp); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) goto bad; memcpy(&old_key, &th->session_key, sizeof(old_key)); @@ -224,7 +235,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, tpend = tp + dlen; dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - struct_v = ceph_decode_8(&tp); + blob_struct_v = ceph_decode_8(&tp); new_secret_id = ceph_decode_64(&tp); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); if (ret) @@ -618,7 +629,9 @@ static void ceph_x_invalidate_authorizer(struct 
ceph_auth_client *ac, static const struct ceph_auth_client_ops ceph_x_ops = { + .name = "x", .is_authenticated = ceph_x_is_authenticated, + .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d940053..ae3e3a3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + if (!msg) + return -ENOMEM; msg->hdr.tid = cpu_to_le64(flush_tid); @@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) */ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1663,7 +1665,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 
0 : 1; int flushing = 0; @@ -1716,10 +1718,9 @@ out_unlocked: static int caps_are_flushed(struct inode *inode, unsigned tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int dirty, i, ret = 1; + int i, ret = 1; spin_lock(&inode->i_lock); - dirty = __ceph_caps_dirty(ci); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1775,9 +1776,9 @@ out: spin_unlock(&ci->i_unsafe_lock); } -int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) +int ceph_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); unsigned flush_tid; int ret; @@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 0c2241e..2fa992e 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -19,7 +19,7 @@ * Ceph release version */ #define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 19 +#define CEPH_VERSION_MINOR 20 #define CEPH_VERSION_PATCH 0 #define _CEPH_STRINGIFY(x) #x @@ -36,7 +36,7 @@ * client-facing protocol. 
*/ #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ #define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */ @@ -53,8 +53,18 @@ /* * feature bits */ -#define CEPH_FEATURE_SUPPORTED 0 -#define CEPH_FEATURE_REQUIRED 0 +#define CEPH_FEATURE_UID 1 +#define CEPH_FEATURE_NOSRCADDR 2 +#define CEPH_FEATURE_FLOCK 4 + +#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK +#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR /* @@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + /********************************************* * message layer @@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + /* osd */ #define CEPH_MSG_OSD_MAP 41 #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 +/* pool operations */ +enum { + POOL_OP_CREATE = 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 auid; + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; @@ -212,16 +265,17 @@ extern const char *ceph_mds_state_name(int s); * - they also define the lock ordering by the MDS * - a few of these are internal to the mds */ -#define CEPH_LOCK_DN 1 -#define CEPH_LOCK_ISNAP 2 -#define CEPH_LOCK_IVERSION 4 /* mds internal */ -#define CEPH_LOCK_IFILE 8 /* mds internal */ -#define CEPH_LOCK_IAUTH 32 -#define CEPH_LOCK_ILINK 64 -#define CEPH_LOCK_IDFT 128 /* dir frag tree */ -#define CEPH_LOCK_INEST 256 /* mds internal */ -#define CEPH_LOCK_IXATTR 512 -#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ +#define CEPH_LOCK_DVERSION 1 +#define CEPH_LOCK_DN 2 +#define CEPH_LOCK_ISNAP 16 +#define CEPH_LOCK_IVERSION 32 /* mds internal */ +#define CEPH_LOCK_IFILE 64 +#define CEPH_LOCK_IAUTH 128 +#define CEPH_LOCK_ILINK 256 +#define CEPH_LOCK_IDFT 512 /* dir frag tree */ +#define CEPH_LOCK_INEST 1024 /* mds 
internal */ +#define CEPH_LOCK_IXATTR 2048 +#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ /* client_session ops */ enum { @@ -308,6 +362,7 @@ union ceph_mds_request_args { struct { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 8e4be6a..7503aee 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type) case CEPH_ENTITY_TYPE_OSD: return "osd"; case CEPH_ENTITY_TYPE_MON: return "mon"; case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; case CEPH_ENTITY_TYPE_AUTH: return "auth"; default: return "unknown"; } @@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; case CEPH_OSD_OP_PULL: return "pull"; case CEPH_OSD_OP_PUSH: return "push"; @@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o) } return "???"; } + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f7048da..3be33fb 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p) static int monc_show(struct seq_file *s, void *p) { struct ceph_client *client = s->private; - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_client *monc = &client->monc; struct rb_node *rp; @@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p) if (monc->want_next_osdmap) seq_printf(s, "want next osdmap\n"); - for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { - req = rb_entry(rp, struct ceph_mon_statfs_request, node); - seq_printf(s, "%lld statfs\n", req->tid); + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); } mutex_unlock(&monc->mutex); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 650d2db..f857193 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry) return -ENOMEM; /* oh well */ spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) /* lost a race */ + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; + } di->dentry = dentry; di->lease_session = NULL; dentry->d_fsdata = di; @@ -125,7 +128,8 @@ more: dentry = list_entry(p, struct dentry, d_u.d_child); di = ceph_dentry(dentry); while (1) { - dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? 
"!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { fi->at_end = 1; @@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; const int max_entries = client->mount_args->max_readdir; + const int max_bytes = client->mount_args->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -312,6 +317,7 @@ more: req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { @@ -335,7 +341,7 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 0; + fi->next_offset = 2; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? */ @@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - di->offset = ci->i_max_offset++; spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); @@ -582,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; /* we only need inode linkage */ @@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - new_dentry->d_time = jiffies; - ceph_dentry(new_dentry)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(new_dentry); } ceph_mdsc_put_request(req); return err; } +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} /* * Check if dentry lease is valid. If not, delete the lease. 
Try to @@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode, + ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1092,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, * an fsync() on a dir will wait for any uncommitted directory * operations to commit. */ -static int ceph_dir_fsync(struct file *file, struct dentry *dentry, - int datasync) +static int ceph_dir_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_path.dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct list_head *head = &ci->i_unsafe_dirops; struct ceph_mds_request *req; @@ -1152,7 +1166,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1165,10 +1179,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); + dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, + dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1183,7 +1197,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9d67572..4480cb1 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", fh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); @@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, req = ceph_mdsc_create_request(mdsc, 
CEPH_MDS_OP_LOOKUPHASH, USE_ANY_MDS); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_ino1 = vino; req->r_ino2.ino = cfh->parent_ino; @@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, } dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { @@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d63493..6251a15 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { @@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages) /* * allocate a vector new pages */ -static struct page **alloc_page_vector(int num_pages) +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) { struct page **pages; int i; - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * num_pages, flags); if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(flags); if (pages[i] == NULL) { ceph_release_page_vector(pages, i); return ERR_PTR(-ENOMEM); @@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, * in sequence. 
*/ } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); } if (IS_ERR(pages)) return PTR_ERR(pages); @@ -649,8 +649,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; num_pages = calc_pages_for(pos, len); @@ -668,7 +668,7 @@ more: truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1)); } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; loff_t endoff = pos + iov->iov_len; int got = 0; int ret, err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 85b4d2f..226f5a5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); + return inode; inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; @@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_client(ci->vfs_inode.i_sb)->mdsc; + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + xattr_blob = NULL; } inode->i_mapping->a_ops = &ceph_aops; inode->i_mapping->backing_dev_info = - &ceph_client(inode->i_sb)->backing_dev_info; + &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode, /* set dir completion flag? */ if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_max_offset = 2; } /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) + if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -802,6 +804,37 @@ out_unlock: } /* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. 
+ */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + spin_unlock(&inode->i_lock); + return; + } + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dcache_lock); + spin_lock(&dn->d_lock); + list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dcache_lock); +} + +/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; + BUG_ON(dn->d_inode); + /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); @@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, dn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", dn, dn->d_inode, ceph_vinop(dn->d_inode)); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); + ceph_set_dentry_offset(dn); out: return dn; } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - di = ceph_dentry(dn); - - spin_lock(&inode->i_lock); - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); - - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); - list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); -} - -/* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., * after a lookup). 
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); - dout(" clearing %p complete (empty trace)\n", - req->r_locked_dir); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); return 0; } @@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, dn, dn->d_name.len, dn->d_name.name); + /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - dn->d_time = jiffies; - ceph_dentry(dn)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(dn); + /* take overwritten dentry's readdir offset */ + dout("dn %p gets %p offset %lld (old offset %lld)\n", + req->r_old_dentry, dn, ceph_dentry(dn)->offset, + ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset = ceph_dentry(dn)->offset; + dn = req->r_old_dentry; /* use old_dentry */ in = dn->d_inode; } @@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - ceph_set_dentry_offset(dn); igrab(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { @@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, err = PTR_ERR(dn); goto done; } - ceph_set_dentry_offset(dn); req->r_dentry = dn; /* may have spliced */ igrab(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ @@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - if (queue_work(ceph_client(inode->i_sb)->trunc_wq, + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); igrab(inode); @@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8a5bcae..d085f07 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 24561a5..b49f128 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -40,7 +40,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); -const static struct ceph_connection_operations mds_con_ops; +static const struct ceph_connection_operations mds_con_ops; /* @@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head 
*h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); @@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int mstate; int mds = session->s_mds; - int err = 0; /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); @@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc, /* send connect message */ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); - if (IS_ERR(msg)) { - err = PTR_ERR(msg); - goto out; - } + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); - -out: return 0; } @@ -804,12 +799,49 @@ out: } static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) + void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - ceph_remove_cap(cap); + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + if (!__ceph_is_any_real_caps(ci)) { + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + if (drop && ci->i_wrbuffer_ref) { + pr_info(" dropping dirty data for %p %lld\n", + inode, ceph_ino(inode)); + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + drop++; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&inode->i_lock); + while (drop--) + iput(inode); return 0; } @@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session) dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); } @@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, ceph_mds_state_name(state)); msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (IS_ERR(msg)) - return PTR_ERR(msg); + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } @@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; - int err = 0; dout("request_close_session mds%d state %s seq %lld\n", session->s_mds, session_state_name(session->s_state), session->s_seq); msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); - if (IS_ERR(msg)) - err = PTR_ERR(msg); - else - ceph_con_send(&session->s_con, msg); - return err; + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; } /* @@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = 
ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - 0, 0, NULL); + GFP_NOFS); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg; dout("send_cap_releases mds%d\n", session->s_mds); - while (1) { - spin_lock(&session->s_cap_lock); - if (list_empty(&session->s_cap_releases_done)) - break; + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { msg = list_first_entry(&session->s_cap_releases_done, struct ceph_msg, list_head); list_del_init(&msg->list_head); @@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); + spin_lock(&session->s_cap_lock); } spin_unlock(&session->s_cap_lock); } +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); + head->num = cpu_to_le32(0); + session->s_num_cap_releases += num; + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } + + spin_unlock(&session->s_cap_lock); +} + /* * requests */ @@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) if (!req) return ERR_PTR(-ENOMEM); + mutex_init(&req->r_fill_mutex); req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1251,7 +1320,7 @@ retry: len += 1 + temp->d_name.len; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry %p\n", dentry); + pr_err("build_path corrupt dentry %p\n", dentry); return ERR_PTR(-EINVAL); } } @@ -1267,7 +1336,7 @@ retry: struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { - dout("build_path_dentry path+%d: %p SNAPDIR\n", + dout("build_path path+%d: %p SNAPDIR\n", pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { @@ -1278,20 +1347,18 @@ retry: break; strncpy(path + pos, temp->d_name.name, temp->d_name.len); - dout("build_path_dentry path+%d: %p '%.*s'\n", - pos, temp, temp->d_name.len, path + pos); } if (pos) path[--pos] = '/'; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry\n"); + pr_err("build_path corrupt dentry\n"); kfree(path); return ERR_PTR(-EINVAL); } } if (pos != 0) { - pr_err("build_path_dentry did not end path lookup where " + pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a rename of one 
of the parent directories (we can not @@ -1303,7 +1370,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; - dout("build_path_dentry on %p %d built %llx '%.*s'\n", + dout("build_path on %p %d built %llx '%.*s'\n", dentry, atomic_read(&dentry->d_count), *base, len, path); return path; } @@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + if (!msg) { + msg = ERR_PTR(-ENOMEM); goto out_free2; + } msg->hdr.tid = cpu_to_le64(req->r_tid); @@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, } msg = create_request_message(mdsc, req, mds); if (IS_ERR(msg)) { - req->r_reply = ERR_PTR(PTR_ERR(msg)); + req->r_err = PTR_ERR(msg); complete_request(mdsc, req); - return -PTR_ERR(msg); + return PTR_ERR(msg); } req->r_request = msg; @@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_reply) + if (req->r_err || req->r_got_result) goto out; if (req->r_timeout && @@ -1609,7 +1678,7 @@ out: return err; finish: - req->r_reply = ERR_PTR(err); + req->r_err = err; complete_request(mdsc, req); goto out; } @@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc, /* * Wake up threads with requests pending for @mds, so that they can - * resubmit their requests to a possibly different mds. If @all is set, - * wake up if their requests has been forwarded to @mds, too. + * resubmit their requests to a possibly different mds. */ -static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) +static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_mds_request *req; struct rb_node *p; @@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, __register_request(mdsc, req, dir); __do_request(mdsc, req); - /* wait */ - if (!req->r_reply) { - mutex_unlock(&mdsc->mutex); - if (req->r_timeout) { - err = (long)wait_for_completion_interruptible_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - req->r_reply = ERR_PTR(-EIO); - else if (err < 0) - req->r_reply = ERR_PTR(err); - } else { - err = wait_for_completion_interruptible( - &req->r_completion); - if (err) - req->r_reply = ERR_PTR(err); - } - mutex_lock(&mdsc->mutex); + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; } - if (IS_ERR(req->r_reply)) { - err = PTR_ERR(req->r_reply); - req->r_reply = NULL; + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_killable_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else { + err = wait_for_completion_killable(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); - if (err == -ERESTARTSYS) { - /* aborted */ - req->r_aborted = true; + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); - if (req->r_locked_dir && - (req->r_op & CEPH_MDS_OP_WRITE)) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or 
ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); - dout("aborted, clearing I_COMPLETE on %p\n", - req->r_locked_dir); - spin_lock(&req->r_locked_dir->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - spin_unlock(&req->r_locked_dir->i_lock); - } - } else { - /* clean up this request */ - __unregister_request(mdsc, req); - if (!list_empty(&req->r_unsafe_item)) - list_del_init(&req->r_unsafe_item); - complete(&req->r_safe_completion); - } - } else if (req->r_err) { - err = req->r_err; + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); } else { - err = le32_to_cpu(req->r_reply_info.head->result); + err = req->r_err; } - mutex_unlock(&mdsc->mutex); +out: + mutex_unlock(&mdsc->mutex); dout("do_request %p done, result %d\n", req, err); return err; } /* + * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + struct ceph_inode_info *ci = ceph_inode(inode); + + dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&inode->i_lock); + + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* * Handle mds reply. * * We take the session mutex and parse and process the reply immediately. @@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } + if (req->r_got_safe && !head->safe) { + pr_warning("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } result = le32_to_cpu(head->result); @@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } - } - - BUG_ON(req->r_reply); - - if (!head->safe) { + } else { req->r_got_unsafe = true; list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } @@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(&req->r_caps_reservation); } + mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); out_err: - if (err) { - req->r_err = err; + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } } else { - req->r_reply = msg; - ceph_msg_get(msg); + dout("reply arrived after request %lld was aborted\n", tid); } + mutex_unlock(&mdsc->mutex); add_cap_releases(mdsc, req->r_session, -1); mutex_unlock(&session->s_mutex); @@ -1921,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); req = __lookup_request(mdsc, tid); if (!req) { - dout("forward %llu to mds%d - req dne\n", tid, next_mds); + dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? 
*/ } - if (fwd_seq <= req->r_num_fwd) { - dout("forward %llu to mds%d - old seq %d <= %d\n", + if (req->r_aborted) { + dout("forward tid %llu aborted, unregistering\n", tid); + __unregister_request(mdsc, req); + } else if (fwd_seq <= req->r_num_fwd) { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", tid, next_mds, req->r_num_fwd, fwd_seq); } else { /* resend. forward race not possible; mds would drop */ - dout("forward %llu to mds%d (we resend)\n", tid, next_mds); + dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + BUG_ON(req->r_err); + BUG_ON(req->r_got_result); req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); @@ -1984,6 +2082,8 @@ static void handle_session(struct ceph_mds_session *session, switch (op) { case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; renewed_caps(mdsc, session, 0); wake = 1; @@ -1997,10 +2097,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ complete(&mdsc->session_close_waiters); - kick_requests(mdsc, mds, 0); /* cur only */ + kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: @@ -2132,54 +2234,44 @@ out: * * called with mdsc->mutex held. */ -static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { - struct ceph_mds_session *session = NULL; struct ceph_msg *reply; struct rb_node *p; + int mds = session->s_mds; int err = -ENOMEM; struct ceph_pagelist *pagelist; - pr_info("reconnect to recovering mds%d\n", mds); + pr_info("mds%d reconnect start\n", mds); pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + if (!reply) goto fail_nomsg; - } - - /* find session */ - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); /* drop lock for duration */ - if (session) { - mutex_lock(&session->s_mutex); + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; - session->s_state = CEPH_MDS_SESSION_RECONNECTING; - session->s_seq = 0; + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - ceph_con_open(&session->s_con, - ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - - /* replay unsafe requests */ - replay_unsafe_requests(mdsc, session); - } else { - dout("no session for mds%d, will send short reconnect\n", - mds); - } + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); down_read(&mdsc->snap_rwsem); - if (!session) - goto send; dout("session %p state %s\n", session, session_state_name(session->s_state)); + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + /* traverse this session's caps */ err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); if (err) @@ -2208,36 +2300,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) goto fail; } -send: reply->pagelist = pagelist; reply->hdr.data_len = cpu_to_le32(pagelist->length); 
reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); - session->s_state = CEPH_MDS_SESSION_OPEN; mutex_unlock(&session->s_mutex); mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); - ceph_put_mds_session(session); - up_read(&mdsc->snap_rwsem); - mutex_lock(&mdsc->mutex); return; fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); fail_nomsg: ceph_pagelist_release(pagelist); kfree(pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); - mutex_lock(&mdsc->mutex); return; } @@ -2290,7 +2375,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i, 1); + kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -2299,22 +2384,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, * send reconnect? */ if (s->s_state == CEPH_MDS_SESSION_RESTARTING && - newstate >= CEPH_MDS_STATE_RECONNECT) - send_mds_reconnect(mdsc, i); + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } /* - * kick requests on any mds that has gone active. - * - * kick requests on cur or forwarder: we may have sent - * the request to mds1, mds1 told us it forwarded it - * to mds2, but then we learn mds1 failed and can't be - * sure it successfully forwarded our request before - * it died. + * kick request on any mds that has gone active. */ if (oldstate < CEPH_MDS_STATE_ACTIVE && newstate >= CEPH_MDS_STATE_ACTIVE) { - pr_info("mds%d reconnect completed\n", s->s_mds); - kick_requests(mdsc, i, 1); + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); ceph_kick_flushing_caps(mdsc, s); wake_up_session_caps(s, 1); } @@ -2457,12 +2541,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + if (!msg) return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(CEPH_LOCK_DN); + lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2492,7 +2576,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask != CEPH_LOCK_DN); + BUG_ON(mask == 0); /* is dentry lease valid? 
*/ spin_lock(&dentry->d_lock); @@ -2603,7 +2687,9 @@ static void delayed_work(struct work_struct *work) else ceph_con_keepalive(&s->s_con); add_cap_releases(mdsc, s, -1); - send_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); @@ -2620,6 +2706,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) mdsc->client = client; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) + return -ENOMEM; + init_completion(&mdsc->safe_umount_waiters); init_completion(&mdsc->session_close_waiters); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -2645,6 +2734,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) init_waitqueue_head(&mdsc->cap_flushing_wq); spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + return 0; } @@ -2740,6 +2830,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; @@ -2922,9 +3015,10 @@ static void con_put(struct ceph_connection *con) static void peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; - pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", - s->s_mds); + pr_warning("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); } static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) @@ -3031,7 +3125,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->client->monc); } -const static struct ceph_connection_operations mds_con_ops = { +static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, .dispatch = dispatch, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 961cc6f..d9936c4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -165,6 +165,8 @@ struct ceph_mds_request { struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ struct inode *r_target_inode; /* resulting inode */ + struct mutex r_fill_mutex; + union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ @@ -213,7 +215,7 @@ struct ceph_mds_request { struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe; + bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; u32 r_readdir_offset; @@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, struct dentry *dn, int mask); +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); + extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index cd4fadb..64b8b1f 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); static void ceph_fault(struct ceph_connection *con); -const char *ceph_name_type_str(int t) -{ - switch (t) { - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_MDS: return 
"mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; - default: return "???"; - } -} - /* * nicely render a sockaddr as a string. */ @@ -132,6 +120,12 @@ void ceph_msgr_exit(void) destroy_workqueue(ceph_msgr_wq); } +void ceph_msgr_flush() +{ + flush_workqueue(ceph_msgr_wq); +} + + /* * socket callback functions */ @@ -340,6 +334,7 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -357,6 +352,7 @@ void ceph_con_close(struct ceph_connection *con) clear_bit(WRITE_PENDING, &con->state); mutex_lock(&con->mutex); reset_connection(con); + con->peer_global_seq = 0; cancel_delayed_work(&con->work); mutex_unlock(&con->mutex); queue_con(con); @@ -661,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED; + con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1124,8 +1120,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; + u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1233,6 +1229,7 @@ static int process_connect(struct ceph_connection *con) clear_bit(CONNECTING, &con->state); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; + con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", con->peer_global_seq, le32_to_cpu(con->in_reply.connect_seq), @@ -1402,19 +1399,17 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ - dout("alloc_msg returned NULL, skipping message\n"); + dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 0; } - if (IS_ERR(con->in_msg)) { - ret = PTR_ERR(con->in_msg); - con->in_msg = NULL; + if (!con->in_msg) { con->error_msg = "error allocating memory for incoming message"; - return ret; + return -ENOMEM; } m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ @@ -1514,14 +1509,14 @@ static void process_message(struct ceph_connection *con) /* if first message, set peer_name */ if (con->peer_name.type == 0) - con->peer_name = msg->hdr.src.name; + con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), - ENTITY_NAME(msg->hdr.src.name), + ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), @@ -1546,7 +1541,6 @@ static int try_write(struct ceph_connection *con) dout("try_write start %p state %lu nref %d\n", con, con->state, atomic_read(&con->nref)); - mutex_lock(&con->mutex); more: 
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); @@ -1639,7 +1633,6 @@ do_next: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_write done on %p\n", con); return ret; } @@ -1651,7 +1644,6 @@ out: */ static int try_read(struct ceph_connection *con) { - struct ceph_messenger *msgr; int ret = -1; if (!con->sock) @@ -1661,9 +1653,6 @@ static int try_read(struct ceph_connection *con) return 0; dout("try_read start on %p\n", con); - msgr = con->msgr; - - mutex_lock(&con->mutex); more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, @@ -1758,7 +1747,6 @@ more: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_read done on %p\n", con); return ret; @@ -1830,6 +1818,8 @@ more: dout("con_work %p start, clearing QUEUED\n", con); clear_bit(QUEUED, &con->state); + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -1844,11 +1834,16 @@ more: if (test_and_clear_bit(SOCK_CLOSED, &con->state) || try_read(con) < 0 || try_write(con) < 0) { + mutex_unlock(&con->mutex); backoff = 1; ceph_fault(con); /* error/fault path */ + goto done_unlocked; } done: + mutex_unlock(&con->mutex); + +done_unlocked: clear_bit(BUSY, &con->state); dout("con->state=%lu\n", con->state); if (test_bit(QUEUED, &con->state)) { @@ -1947,7 +1942,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) /* the zero page is needed if a request is "canceled" while the message * is being written over the socket */ - msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); if (!msgr->zero_page) { kfree(msgr); return ERR_PTR(-ENOMEM); @@ -1987,9 +1982,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } /* set src+dst */ - msg->hdr.src.name = con->msgr->inst.name; - msg->hdr.src.addr = con->msgr->my_enc_addr; - msg->hdr.orig_src = msg->hdr.src; + msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); @@ -2083,12 +2076,11 @@ void ceph_con_keepalive(struct ceph_connection *con) * construct a new message with given type, size * the new msg has a ref count of 1. 
*/ -struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, struct page **pages) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) { struct ceph_msg *m; - m = kmalloc(sizeof(*m), GFP_NOFS); + m = kmalloc(sizeof(*m), flags); if (m == NULL) goto out; kref_init(&m->kref); @@ -2100,8 +2092,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); m->hdr.middle_len = 0; - m->hdr.data_len = cpu_to_le32(page_len); - m->hdr.data_off = cpu_to_le16(page_off); + m->hdr.data_len = 0; + m->hdr.data_off = 0; m->hdr.reserved = 0; m->footer.front_crc = 0; m->footer.middle_crc = 0; @@ -2115,11 +2107,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, /* front */ if (front_len) { if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, GFP_NOFS, + m->front.iov_base = __vmalloc(front_len, flags, PAGE_KERNEL); m->front_is_vmalloc = true; } else { - m->front.iov_base = kmalloc(front_len, GFP_NOFS); + m->front.iov_base = kmalloc(front_len, flags); } if (m->front.iov_base == NULL) { pr_err("msg_new can't allocate %d bytes\n", @@ -2135,19 +2127,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->middle = NULL; /* data */ - m->nr_pages = calc_pages_for(page_off, page_len); - m->pages = pages; + m->nr_pages = 0; + m->pages = NULL; m->pagelist = NULL; - dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, - m->nr_pages); + dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: - pr_err("msg_new can't create type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + pr_err("msg_new can't create type %d front %d\n", type, front_len); + return NULL; } /* @@ -2190,29 +2181,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (IS_ERR(msg)) - return msg; - - if (*skip) + if (!msg || *skip) return NULL; } if (!msg) { *skip = 0; - msg = ceph_msg_new(type, front_len, 0, 0, NULL); + msg = ceph_msg_new(type, front_len, GFP_NOFS); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + return NULL; } } memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len) { + if (middle_len && !msg->middle) { ret = ceph_alloc_middle(con, msg); - if (ret < 0) { ceph_msg_put(msg); - return msg; + return NULL; } } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a5caf91..76fbc95 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -49,10 +49,8 @@ struct ceph_connection_operations { int *skip); }; -extern const char *ceph_name_type_str(int t); - /* use format string %s%d */ -#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) +#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ @@ -144,6 +142,7 @@ struct ceph_connection { struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_name peer_name; /* peer name */ struct ceph_entity_addr peer_addr_for_me; + unsigned peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -158,7 +157,6 @@ struct ceph_connection { struct list_head out_queue; struct list_head out_sent; /* sending or sent but unacked */ u64 out_seq; /* last message queued 
for send */ - u64 out_seq_sent; /* last message sent */ bool out_keepalive_pending; u64 in_seq, in_seq_acked; /* last message received, acked */ @@ -215,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end, extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); +extern void ceph_msgr_flush(void); extern struct ceph_messenger *ceph_messenger_create( struct ceph_entity_addr *myaddr); @@ -234,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); -extern struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, - struct page **pages); +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 8fdc011..21c62e9 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -28,7 +28,7 @@ * resend any outstanding requests. */ -const static struct ceph_connection_operations mon_con_ops; +static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); @@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_con_revoke(monc->con, monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ ceph_con_send(monc->con, monc->m_auth); } @@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc) monc->want_next_osdmap); if ((__sub_expired(monc) && !monc->sub_sent) || monc->want_next_osdmap == 1) { - struct ceph_msg *msg; + struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; - msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL); - if (!msg) - return; - p = msg->front.iov_base; - end = p + msg->front.iov_len; + end = p + msg->front_max; dout("__send_subscribe to 'mdsmap' %u+\n", (unsigned)monc->have_mdsmap); @@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_send(monc->con, msg); + ceph_con_revoke(monc->con, msg); + ceph_con_send(monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -353,14 +351,14 @@ out: /* * statfs */ -static struct ceph_mon_statfs_request *__lookup_statfs( +static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) { - struct ceph_mon_statfs_request *req; - struct rb_node *n = monc->statfs_request_tree.rb_node; + struct ceph_mon_generic_request *req; + struct rb_node *n = monc->generic_request_tree.rb_node; while (n) { - req = rb_entry(n, struct ceph_mon_statfs_request, node); + req = rb_entry(n, struct ceph_mon_generic_request, node); if (tid < req->tid) n = n->rb_left; else if (tid > req->tid) @@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs( return NULL; } -static void __insert_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *new) +static void __insert_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *new) { - struct rb_node **p = &monc->statfs_request_tree.rb_node; + struct rb_node **p = &monc->generic_request_tree.rb_node; struct rb_node *parent = NULL; - struct ceph_mon_statfs_request *req = NULL; 
+ struct ceph_mon_generic_request *req = NULL; while (*p) { parent = *p; - req = rb_entry(parent, struct ceph_mon_statfs_request, node); + req = rb_entry(parent, struct ceph_mon_generic_request, node); if (new->tid < req->tid) p = &(*p)->rb_left; else if (new->tid > req->tid) @@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc, } rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->statfs_request_tree); + rb_insert_color(&new->node, &monc->generic_request_tree); +} + +static void release_generic_request(struct kref *kref) +{ + struct ceph_mon_generic_request *req = + container_of(kref, struct ceph_mon_generic_request, kref); + + if (req->reply) + ceph_msg_put(req->reply); + if (req->request) + ceph_msg_put(req->request); +} + +static void put_generic_request(struct ceph_mon_generic_request *req) +{ + kref_put(&req->kref, release_generic_request); +} + +static void get_generic_request(struct ceph_mon_generic_request *req) +{ + kref_get(&req->kref); +} + +static struct ceph_msg *get_generic_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(hdr->tid); + struct ceph_msg *m; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (!req) { + dout("get_generic_reply %lld dne\n", tid); + *skip = 1; + m = NULL; + } else { + dout("get_generic_reply %lld got %p\n", tid, req->reply); + m = ceph_msg_get(req->reply); + /* + * we don't need to track the connection reading into + * this reply because we only have one open connection + * at a time, ever. + */ + } + mutex_unlock(&monc->mutex); + return m; } static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs_reply *reply = msg->front.iov_base; - u64 tid; + u64 tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len != sizeof(*reply)) goto bad; - tid = le64_to_cpu(msg->hdr.tid); dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_statfs(monc, tid); + req = __lookup_generic_req(monc, tid); if (req) { - *req->buf = reply->st; + *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; + get_generic_request(req); } mutex_unlock(&monc->mutex); - if (req) + if (req) { complete(&req->completion); + put_generic_request(req); + } return; bad: - pr_err("corrupt statfs reply, no tid\n"); + pr_err("corrupt generic reply, no tid\n"); ceph_msg_dump(msg); } /* - * (re)send a statfs request + * Do a synchronous statfs(). 
*/ -static int send_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *req) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { - struct ceph_msg *msg; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; + int err; - dout("send_statfs tid %llu\n", req->tid); - msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); - req->request = msg; - msg->hdr.tid = cpu_to_le64(req->tid); - h = msg->front.iov_base; + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; - ceph_con_send(monc->con, msg); - return 0; -} - -/* - * Do a synchronous statfs(). - */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) -{ - struct ceph_mon_statfs_request req; - int err; - - req.buf = buf; - init_completion(&req.completion); - - /* allocate memory for reply */ - err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); - if (err) - return err; /* register request */ mutex_lock(&monc->mutex); - req.tid = ++monc->last_tid; - req.last_attempt = jiffies; - req.delay = BASE_DELAY_INTERVAL; - __insert_statfs(monc, &req); - monc->num_statfs_requests++; + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; mutex_unlock(&monc->mutex); /* send request and wait */ - err = send_statfs(monc, &req); - if (!err) - err = wait_for_completion_interruptible(&req.completion); + ceph_con_send(monc->con, ceph_msg_get(req->request)); + err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req.node, &monc->statfs_request_tree); - monc->num_statfs_requests--; - ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; mutex_unlock(&monc->mutex); if (!err) - err = req.result; + err = req->result; + +out: + kref_put(&req->kref, release_generic_request); return err; } /* * Resend pending statfs requests. 
*/ -static void __resend_statfs(struct ceph_mon_client *monc) +static void __resend_generic_request(struct ceph_mon_client *monc) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct rb_node *p; - for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_mon_statfs_request, node); - send_statfs(monc, req); + for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_generic_request, node); + ceph_con_revoke(monc->con, req->request); + ceph_con_send(monc->con, ceph_msg_get(req->request)); } } @@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; - /* msg pools */ - err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, - sizeof(struct ceph_mon_subscribe_ack), 1, false); - if (err < 0) + /* msgs */ + err = -ENOMEM; + monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, + sizeof(struct ceph_mon_subscribe_ack), + GFP_NOFS); + if (!monc->m_subscribe_ack) goto out_monmap; - err = ceph_msgpool_init(&monc->msgpool_statfs_reply, - sizeof(struct ceph_mon_statfs_reply), 0, false); - if (err < 0) - goto out_pool1; - err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); - if (err < 0) - goto out_pool2; - - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + if (!monc->m_subscribe) + goto out_subscribe_ack; + + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + if (!monc->m_auth_reply) + goto out_subscribe; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); monc->pending_auth = 0; - if (IS_ERR(monc->m_auth)) { - err = PTR_ERR(monc->m_auth); - monc->m_auth = NULL; - goto out_pool3; - } + if (!monc->m_auth) + goto out_auth_reply; monc->cur_mon = -1; monc->hunting = true; @@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->sub_sent = 0; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); - monc->statfs_request_tree = RB_ROOT; - monc->num_statfs_requests = 0; + monc->generic_request_tree = RB_ROOT; + monc->num_generic_requests = 0; monc->last_tid = 0; monc->have_mdsmap = 0; @@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->want_next_osdmap = 1; return 0; -out_pool3: - ceph_msgpool_destroy(&monc->msgpool_auth_reply); -out_pool2: - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); -out_pool1: - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); +out_auth_reply: + ceph_msg_put(monc->m_auth_reply); +out_subscribe: + ceph_msg_put(monc->m_subscribe); +out_subscribe_ack: + ceph_msg_put(monc->m_subscribe_ack); out_monmap: kfree(monc->monmap); out: @@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); ceph_msg_put(monc->m_auth); - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); - ceph_msgpool_destroy(&monc->msgpool_auth_reply); + ceph_msg_put(monc->m_auth_reply); + ceph_msg_put(monc->m_subscribe); + ceph_msg_put(monc->m_subscribe_ack); kfree(monc->monmap); } @@ -662,8 +704,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { int ret; + int was_auth = 0; mutex_lock(&monc->mutex); + if (monc->auth->ops) + was_auth = monc->auth->ops->is_authenticated(monc->auth); monc->pending_auth = 
0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, @@ -674,14 +719,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc, wake_up(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); - } else if (monc->auth->ops->is_authenticated(monc->auth)) { + } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; monc->client->msgr->inst.name.num = monc->auth->global_id; __send_subscribe(monc); - __resend_statfs(monc); + __resend_generic_request(monc); } mutex_unlock(&monc->mutex); } @@ -770,18 +815,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_MON_SUBSCRIBE_ACK: - m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); + m = ceph_msg_get(monc->m_subscribe_ack); break; case CEPH_MSG_STATFS_REPLY: - m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); - break; + return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: - m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); + m = ceph_msg_get(monc->m_auth_reply); break; case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, 0, 0, NULL); + m = ceph_msg_new(type, front_len, GFP_NOFS); break; } @@ -826,7 +870,7 @@ out: mutex_unlock(&monc->mutex); } -const static struct ceph_connection_operations mon_con_ops = { +static const struct ceph_connection_operations mon_con_ops = { .get = ceph_con_get, .put = ceph_con_put, .dispatch = dispatch, diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index b958ad5..174d794 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -2,10 +2,10 @@ #define _FS_CEPH_MON_CLIENT_H #include <linux/completion.h> +#include <linux/kref.h> #include <linux/rbtree.h> #include "messenger.h" -#include "msgpool.h" struct ceph_client; struct ceph_mount_args; @@ -22,7 +22,7 @@ struct ceph_monmap { }; struct ceph_mon_client; -struct ceph_mon_statfs_request; +struct ceph_mon_generic_request; /* @@ -40,17 +40,19 @@ struct ceph_mon_request { }; /* - * statfs() is done a bit differently because we need to get data back + * ceph_mon_generic_request is being used for the statfs and poolop requests + * which are bening done a bit differently because we need to get data back * to the caller */ -struct ceph_mon_statfs_request { +struct ceph_mon_generic_request { + struct kref kref; u64 tid; struct rb_node node; int result; - struct ceph_statfs *buf; + void *buf; struct completion completion; - unsigned long last_attempt, delay; /* jiffies */ struct ceph_msg *request; /* original request */ + struct ceph_msg *reply; /* and reply */ }; struct ceph_mon_client { @@ -61,7 +63,7 @@ struct ceph_mon_client { struct delayed_work delayed_work; struct ceph_auth_client *auth; - struct ceph_msg *m_auth; + struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; int pending_auth; bool hunting; @@ -70,14 +72,9 @@ struct ceph_mon_client { struct ceph_connection *con; bool have_fsid; - /* msg pools */ - struct ceph_msgpool msgpool_subscribe_ack; - struct ceph_msgpool msgpool_statfs_reply; - struct ceph_msgpool msgpool_auth_reply; - - /* pending statfs requests */ - struct rb_root statfs_request_tree; - int num_statfs_requests; + /* pending generic requests */ + struct rb_root generic_request_tree; + int num_generic_requests; u64 last_tid; /* mds/osd map */ diff --git a/fs/ceph/msgpool.c 
b/fs/ceph/msgpool.c index ca3b44a..dd65a64 100644 --- a/fs/ceph/msgpool.c +++ b/fs/ceph/msgpool.c @@ -7,180 +7,58 @@ #include "msgpool.h" -/* - * We use msg pools to preallocate memory for messages we expect to - * receive over the wire, to avoid getting ourselves into OOM - * conditions at unexpected times. We take use a few different - * strategies: - * - * - for request/response type interactions, we preallocate the - * memory needed for the response when we generate the request. - * - * - for messages we can receive at any time from the MDS, we preallocate - * a pool of messages we can re-use. - * - * - for writeback, we preallocate some number of messages to use for - * requests and their replies, so that we always make forward - * progress. - * - * The msgpool behaves like a mempool_t, but keeps preallocated - * ceph_msgs strung together on a list_head instead of using a pointer - * vector. This avoids vector reallocation when we adjust the number - * of preallocated items (which happens frequently). - */ +static void *alloc_fn(gfp_t gfp_mask, void *arg) +{ + struct ceph_msgpool *pool = arg; + void *p; + p = ceph_msg_new(0, pool->front_len, gfp_mask); + if (!p) + pr_err("msgpool %s alloc failed\n", pool->name); + return p; +} -/* - * Allocate or release as necessary to meet our target pool size. - */ -static int __fill_msgpool(struct ceph_msgpool *pool) +static void free_fn(void *element, void *arg) { - struct ceph_msg *msg; - - while (pool->num < pool->min) { - dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num, - pool->min); - spin_unlock(&pool->lock); - msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL); - spin_lock(&pool->lock); - if (IS_ERR(msg)) - return PTR_ERR(msg); - msg->pool = pool; - list_add(&msg->list_head, &pool->msgs); - pool->num++; - } - while (pool->num > pool->min) { - msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head); - dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num, - pool->min, msg); - list_del_init(&msg->list_head); - pool->num--; - ceph_msg_kfree(msg); - } - return 0; + ceph_msg_put(element); } int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int min, bool blocking) + int front_len, int size, bool blocking, const char *name) { - int ret; - - dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min); - spin_lock_init(&pool->lock); pool->front_len = front_len; - INIT_LIST_HEAD(&pool->msgs); - pool->num = 0; - pool->min = min; - pool->blocking = blocking; - init_waitqueue_head(&pool->wait); - - spin_lock(&pool->lock); - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; + pool->pool = mempool_create(size, alloc_fn, free_fn, pool); + if (!pool->pool) + return -ENOMEM; + pool->name = name; + return 0; } void ceph_msgpool_destroy(struct ceph_msgpool *pool) { - dout("msgpool_destroy %p\n", pool); - spin_lock(&pool->lock); - pool->min = 0; - __fill_msgpool(pool); - spin_unlock(&pool->lock); + mempool_destroy(pool->pool); } -int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, + int front_len) { - int ret; - - spin_lock(&pool->lock); - dout("msgpool_resv %p delta %d\n", pool, delta); - pool->min += delta; - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; -} - -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) -{ - wait_queue_t wait; - struct ceph_msg *msg; - - if (front_len && front_len > pool->front_len) { - pr_err("msgpool_get pool %p need front %d, pool size is %d\n", - 
pool, front_len, pool->front_len); + if (front_len > pool->front_len) { + pr_err("msgpool_get pool %s need front %d, pool size is %d\n", + pool->name, front_len, pool->front_len); WARN_ON(1); /* try to alloc a fresh message */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - } - - if (!front_len) - front_len = pool->front_len; - - if (pool->blocking) { - /* mempool_t behavior; first try to alloc */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; + return ceph_msg_new(0, front_len, GFP_NOFS); } - while (1) { - spin_lock(&pool->lock); - if (likely(pool->num)) { - msg = list_entry(pool->msgs.next, struct ceph_msg, - list_head); - list_del_init(&msg->list_head); - pool->num--; - dout("msgpool_get %p got %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - return msg; - } - pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num, - pool->min, pool->blocking ? "waiting" : "may fail"); - spin_unlock(&pool->lock); - - if (!pool->blocking) { - WARN_ON(1); - - /* maybe we can allocate it now? */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - - pr_err("msgpool_get %p empty + alloc failed\n", pool); - return ERR_PTR(-ENOMEM); - } - - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - schedule(); - finish_wait(&pool->wait, &wait); - } + return mempool_alloc(pool->pool, GFP_NOFS); } void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) { - spin_lock(&pool->lock); - if (pool->num < pool->min) { - /* reset msg front_len; user may have changed it */ - msg->front.iov_len = pool->front_len; - msg->hdr.front_len = cpu_to_le32(pool->front_len); + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); - kref_set(&msg->kref, 1); /* retake a single ref */ - list_add(&msg->list_head, &pool->msgs); - pool->num++; - dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - wake_up(&pool->wait); - } else { - dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - ceph_msg_kfree(msg); - } + kref_init(&msg->kref); /* retake single ref */ } diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h index bc834bf..a362605f 100644 --- a/fs/ceph/msgpool.h +++ b/fs/ceph/msgpool.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_MSGPOOL #define _FS_CEPH_MSGPOOL +#include <linux/mempool.h> #include "messenger.h" /* @@ -8,18 +9,15 @@ * avoid unexpected OOM conditions. 
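For illustration only (this sketch is not part of the patch): the hand-rolled, list-based pool described above is being replaced by the kernel's generic mempool_t, using the same alloc_fn/free_fn callback pattern the new msgpool.c code adopts. A minimal mempool user, with an invented element type and function names, looks roughly like this:

    #include <linux/mempool.h>
    #include <linux/slab.h>

    struct my_elem {                        /* illustrative element type */
            char payload[128];
    };

    static void *my_alloc(gfp_t gfp_mask, void *pool_data)
    {
            /* pool_data is the last argument given to mempool_create() */
            return kmalloc(sizeof(struct my_elem), gfp_mask);
    }

    static void my_free(void *element, void *pool_data)
    {
            kfree(element);
    }

    static mempool_t *example_pool;

    static int example_init(void)
    {
            /* keep at least 10 elements in reserve, as the osd msgpools below do */
            example_pool = mempool_create(10, my_alloc, my_free, NULL);
            return example_pool ? 0 : -ENOMEM;
    }

    static void example_use(void)
    {
            /* falls back to the preallocated reserve when a fresh allocation fails */
            struct my_elem *e = mempool_alloc(example_pool, GFP_NOFS);

            if (e)
                    mempool_free(e, example_pool);
    }

ceph_msgpool_get() above is a thin wrapper around exactly this mempool_alloc() call, with ceph_msg_new()/ceph_msg_put() serving as the pool's callbacks.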
*/ struct ceph_msgpool { - spinlock_t lock; + const char *name; + mempool_t *pool; int front_len; /* preallocated payload size */ - struct list_head msgs; /* msgs in the pool; each has 1 ref */ - int num, min; /* cur, min # msgs in the pool */ - bool blocking; - wait_queue_head_t wait; }; extern int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int size, bool blocking); + int front_len, int size, bool blocking, + const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta); extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, int front_len); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 8aaab41..892a029 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -50,7 +50,6 @@ struct ceph_entity_name { #define CEPH_ENTITY_TYPE_MDS 0x02 #define CEPH_ENTITY_TYPE_OSD 0x04 #define CEPH_ENTITY_TYPE_CLIENT 0x08 -#define CEPH_ENTITY_TYPE_ADMIN 0x10 #define CEPH_ENTITY_TYPE_AUTH 0x20 #define CEPH_ENTITY_TYPE_ANY 0xFF @@ -120,7 +119,7 @@ struct ceph_msg_connect_reply { /* * message header */ -struct ceph_msg_header { +struct ceph_msg_header_old { __le64 seq; /* message seq# for this session */ __le64 tid; /* transaction id */ __le16 type; /* message type */ @@ -138,6 +137,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 3514f71..d25b4ad 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -16,7 +16,7 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 -const static struct ceph_connection_operations osd_con_ops; +static const struct ceph_connection_operations osd_con_ops; static int __kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *kickosd); @@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = kzalloc(sizeof(*req), GFP_NOFS); } if (req == NULL) - return ERR_PTR(-ENOMEM); + return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; @@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); - if (IS_ERR(msg)) { + OSD_OPREPLY_FRONT_LEN, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } req->r_reply = msg; @@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); + 
if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); @@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) + if (atomic_dec_and_test(&osd->o_ref)) { + struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; + + if (osd->o_authorizer) + ac->ops->destroy_authorizer(ac, osd->o_authorizer); kfree(osd); + } } /* @@ -715,7 +720,7 @@ static void handle_timeout(struct work_struct *work) * should mark the osd as failed and we should find out about * it from an updated osd map. */ - while (!list_empty(&osdc->req_lru)) { + while (timeout && !list_empty(&osdc->req_lru)) { req = list_entry(osdc->req_lru.next, struct ceph_osd_request, r_req_lru_item); @@ -1078,6 +1083,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); + wake_up(&osdc->client->auth_wq); return; bad: @@ -1087,45 +1093,6 @@ bad: return; } - -/* - * A read request prepares specific pages that data is to be read into. - * When a message is being read off the wire, we call prepare_pages to - * find those pages. - * 0 = success, -1 failure. - */ -static int __prepare_pages(struct ceph_connection *con, - struct ceph_msg_header *hdr, - struct ceph_osd_request *req, - u64 tid, - struct ceph_msg *m) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - int ret = -1; - int data_len = le32_to_cpu(hdr->data_len); - unsigned data_off = le16_to_cpu(hdr->data_off); - - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - - if (!osd) - return -1; - - osdc = osd->o_osdc; - - dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m, - tid, req->r_num_pages, want); - if (unlikely(req->r_num_pages < want)) - goto out; - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - ret = 0; /* success */ -out: - BUG_ON(ret < 0 || m->nr_pages < want); - - return ret; -} - /* * Register request, send initial attempt. */ @@ -1252,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); + err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, - OSD_OPREPLY_FRONT_LEN, 10, true); + OSD_OPREPLY_FRONT_LEN, 10, true, + "osd_op_reply"); if (err < 0) goto out_msgpool; return 0; @@ -1302,8 +1271,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1345,8 +1314,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short write due to an object boundary */ req->r_pages = pages; @@ -1394,7 +1363,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) } /* - * lookup and return message for incoming reply + * lookup and return message for incoming reply. set up reply message + * pages. 
*/ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -1407,7 +1377,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; - int err; tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); @@ -1425,13 +1394,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_con_filling_msg); ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); ceph_con_put(req->r_con_filling_msg); + req->r_con_filling_msg = NULL; } if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); - if (IS_ERR(m)) + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + if (!m) goto out; ceph_msg_put(req->r_reply); req->r_reply = m; @@ -1439,12 +1409,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - err = __prepare_pages(con, hdr, req, tid, m); - if (err < 0) { + unsigned data_off = le16_to_cpu(hdr->data_off); + int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + + if (unlikely(req->r_num_pages < want)) { + pr_warning("tid %lld reply %d > expected %d pages\n", + tid, want, m->nr_pages); *skip = 1; ceph_msg_put(m); - m = ERR_PTR(err); + m = NULL; + goto out; } + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); @@ -1466,7 +1443,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: - return ceph_msg_new(type, front, 0, 0, NULL); + return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: @@ -1552,7 +1529,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } -const static struct ceph_connection_operations osd_con_ops = { +static const struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, .put = put_osd_con, .dispatch = dispatch, diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index cfdd8f4..ddc656f 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -706,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, len, *p, end); newcrush = crush_decode(*p, min(*p+len, end)); if (IS_ERR(newcrush)) - return ERR_PTR(PTR_ERR(newcrush)); + return ERR_CAST(newcrush); } /* new flags? 
*/ diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c index 5f8dbf7..b6859f4 100644 --- a/fs/ceph/pagelist.c +++ b/fs/ceph/pagelist.c @@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl) static int ceph_pagelist_addpage(struct ceph_pagelist *pl) { - struct page *page = alloc_page(GFP_NOFS); + struct page *page = __page_cache_alloc(GFP_NOFS); if (!page) return -ENOMEM; pl->room += PAGE_SIZE; diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index fd56451..8fcc023 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -101,8 +101,8 @@ struct ceph_pg_pool { __le64 snap_seq; /* seq for per-pool snapshot */ __le32 snap_epoch; /* epoch of last snap */ __le32 num_snaps; - __le32 num_removed_snap_intervals; - __le64 uid; + __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ + __le64 auid; /* who owns the pg */ } __attribute__ ((packed)); /* @@ -208,6 +208,7 @@ enum { /* read */ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, /* write */ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, @@ -305,6 +306,22 @@ enum { #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ #define EBLACKLISTED ESHUTDOWN /* blacklisted */ +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_NOP = 0, + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + /* * an individual object operation. each may be accompanied by some data * payload @@ -321,6 +338,8 @@ struct ceph_osd_op { struct { __le32 name_len; __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ } __attribute__ ((packed)) xattr; struct { __u8 class_len; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index d5114db..c0b26b6 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9307bbe..4e0bee2 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -8,14 +8,11 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/parser.h> -#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> -#include <linux/version.h> -#include <linux/vmalloc.h> #include "decode.h" #include "super.h" @@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_syncfs(struct super_block *sb, int wait) { dout("sync_fs %d\n", wait); - ceph_osdc_sync(&ceph_client(sb)->osdc); - ceph_mdsc_sync(&ceph_client(sb)->mdsc); + ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); + ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); dout("sync_fs %d done\n", wait); return 0; } +static int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. 
+ * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} /** * ceph_show_options - Show mount options in /proc/mounts @@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",nocrc"); if (args->flags & CEPH_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + + if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", args->mount_timeout); + if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); + if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", args->osd_timeout); + if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + args->osd_keepalive_timeout); + if (args->wsize) + seq_printf(m, ",wsize=%d", args->wsize); + if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", args->rsize); + if (args->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); + if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + args->caps_wanted_delay_min); + if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + args->caps_wanted_delay_max); + if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + args->cap_release_safety); + if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); + if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", args->snapdir_name); if (args->name) @@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo) inode_init_once(&ci->vfs_inode); } -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. 
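As a worked example of the formula just above (not part of the patch): on a machine with 1 GiB of RAM and 4 KiB pages, totalram_pages is 262144, int_sqrt(262144) = 512, and 16 * 512 = 8192 shifted left by PAGE_SHIFT - 10 = 2 bits gives 32768 kB, matching the "1GB: 32768k" row of the table and staying well under the 256*1024 kB cap.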
- * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - static int __init init_caches(void) { ceph_inode_cachep = kmem_cache_create("ceph_inode_info", @@ -308,7 +333,9 @@ enum { Opt_osd_idle_ttl, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_cap_release_safety, Opt_readdir_max_entries, + Opt_readdir_max_bytes, Opt_congestion_kb, Opt_last_int, /* int args above */ @@ -339,7 +366,9 @@ static match_table_t arg_tokens = { {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, @@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; - args->max_readdir = 1024; + args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + args->max_readdir = CEPH_MAX_READDIR_DEFAULT; + args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; args->congestion_kb = default_congestion_kb(); /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ @@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_readdir_max_entries: args->max_readdir = intval; break; + case Opt_readdir_max_bytes: + args->max_readdir_bytes = intval; + break; case Opt_congestion_kb: args->congestion_kb = intval; break; @@ -636,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client) /* unmount */ ceph_mdsc_stop(&client->mdsc); - ceph_monc_stop(&client->monc); ceph_osdc_stop(&client->osdc); + /* + * make sure mds and osd connections close out before destroying + * the auth module, which is needed to free those connections' + * ceph_authorizers. 
+ */ + ceph_msgr_flush(); + + ceph_monc_stop(&client->monc); + ceph_adjust_min_caps(-client->min_caps); ceph_debugfs_client_cleanup(client); @@ -682,9 +723,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_map(struct ceph_client *client) +static int have_mon_and_osd_map(struct ceph_client *client) { - return client->monc.monmap && client->monc.monmap->epoch; + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; } /* @@ -704,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client, dout("open_root_inode opening '%s'\n", path); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) - return ERR_PTR(PTR_ERR(req)); + return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; @@ -762,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, if (err < 0) goto out; - while (!have_mon_map(client)) { + while (!have_mon_and_osd_map(client)) { err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) goto out; @@ -770,8 +812,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, /* wait */ dout("mount waiting for mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_map(client) || (client->auth_err < 0), - timeout); + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); if (err == -EINTR || err == -ERESTARTSYS) goto out; if (client->auth_err < 0) { @@ -884,6 +926,8 @@ static int ceph_compare_super(struct super_block *sb, void *data) /* * construct our own bdi so we can control readahead, etc. 
*/ +static atomic_long_t bdi_seq = ATOMIC_INIT(0); + static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { int err; @@ -893,7 +937,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) client->backing_dev_info.ra_pages = (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); + err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &client->backing_dev_info; return err; @@ -932,9 +977,9 @@ static int ceph_get_sb(struct file_system_type *fs_type, goto out; } - if (ceph_client(sb) != client) { + if (ceph_sb_to_client(sb) != client) { ceph_destroy_client(client); - client = ceph_client(sb); + client = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", client); } else { dout("get_sb using new client %p\n", client); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 13513b8..10a4a40 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -10,7 +10,6 @@ #include <linux/fs.h> #include <linux/mempool.h> #include <linux/pagemap.h> -#include <linux/slab.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> @@ -52,24 +51,25 @@ struct ceph_mount_args { int sb_flags; + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; int num_mon; struct ceph_entity_addr *mon_addr; - int flags; int mount_timeout; int osd_idle_ttl; - int caps_wanted_delay_min, caps_wanted_delay_max; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int wsize; - int rsize; /* max readahead */ - int max_readdir; /* max readdir size */ - int congestion_kb; /* max readdir size */ int osd_timeout; int osd_keepalive_timeout; + int wsize; + int rsize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ char *snapdir_name; /* default ".snap" */ char *name; char *secret; - int cap_release_safety; }; /* @@ -80,13 +80,14 @@ struct ceph_mount_args { #define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" #define CEPH_AUTH_NAME_DEFAULT "guest" - /* * Delay telling the MDS we no longer want caps, in case we reopen * the file. Delay a minimum amount of time, even if we send a cap @@ -96,6 +97,7 @@ struct ceph_mount_args { #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) /* mount state */ enum { @@ -160,12 +162,6 @@ struct ceph_client { #endif }; -static inline struct ceph_client *ceph_client(struct super_block *sb) -{ - return sb->s_fs_info; -} - - /* * File i/o capability. 
This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -814,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int ceph_fsync(struct file *file, int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern int ceph_get_cap_mds(struct inode *inode); @@ -871,6 +867,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2845422..68aeebc 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -7,7 +7,8 @@ static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, + return !strncmp(name, "ceph.", 5) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); @@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "user.ceph.dir.files", ceph_vxattrcb_files}, - { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, "ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "ceph.dir.files", ceph_vxattrcb_files}, + { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, { true, NULL, NULL } }; @@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "user.ceph.layout", ceph_vxattrcb_layout}, + { true, "ceph.layout", ceph_vxattrcb_layout}, { NULL, NULL } }; @@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci, ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; } - if (!xattr) { - pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n", - &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name, - xattr->val); - return -ENOMEM; - } ci->i_xattrs.names_size += name_len; ci->i_xattrs.vals_size += val_len; if (val) @@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ci->i_xattrs.version, ci->i_xattrs.index_version); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version > ci->i_xattrs.version)) { + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { 
spin_unlock(&inode->i_lock); @@ -622,7 +617,7 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; @@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; err = -ENOMEM; for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(GFP_NOFS); if (!pages[i]) { nr_pages = i; goto out; @@ -779,7 +774,7 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = &client->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0242ff9..a7eb65c 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data, extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset); extern int cifs_lock(struct file *, int, struct file_lock *); -extern int cifs_fsync(struct file *, struct dentry *, int); +extern int cifs_fsync(struct file *, int); extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a83541e..f1ff785 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, return rc; } -int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int cifs_fsync(struct file *file, int datasync) { int xid; int rc = 0; @@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) xid = GetXid(); cFYI(1, "Sync file - name: %s datasync: 0x%x", - dentry->d_name.name, datasync); + file->f_path.dentry->d_name.name, datasync); rc = filemap_write_and_wait(inode->i_mapping); if (rc == 0) { diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index d99860a..6b443ff 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -11,8 +11,7 @@ extern int coda_fake_statfs; void coda_destroy_inodecache(void); int coda_init_inodecache(void); -int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, - int datasync); +int coda_fsync(struct file *coda_file, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void); diff --git a/fs/coda/file.c b/fs/coda/file.c index 7196077..ad3cd2a 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) return 0; } -int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) +int coda_fsync(struct file *coda_file, int datasync) { struct file *host_file; - struct inode *coda_inode = coda_dentry->d_inode; + struct inode *coda_inode = coda_file->f_path.dentry->d_inode; struct coda_file_info *cfi; int err = 0; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 773f2ce..ca25d96 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -1,6 +1,6 @@ /* * Pioctl 
operations for Coda. - * Original version: (C) 1996 Peter Braam + * Original version: (C) 1996 Peter Braam * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University * * Carnegie Mellon encourages users of this code to contribute improvements @@ -23,21 +23,22 @@ #include <linux/coda_fs_i.h> #include <linux/coda_psdev.h> +#include <linux/smp_lock.h> + /* pioctl ops */ static int coda_ioctl_permission(struct inode *inode, int mask); -static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data); +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data); /* exported from this file */ -const struct inode_operations coda_ioctl_inode_operations = -{ +const struct inode_operations coda_ioctl_inode_operations = { .permission = coda_ioctl_permission, .setattr = coda_setattr, }; const struct file_operations coda_ioctl_operations = { .owner = THIS_MODULE, - .ioctl = coda_pioctl, + .unlocked_ioctl = coda_pioctl, }; /* the coda pioctl inode ops */ @@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask) return (mask & MAY_EXEC) ? -EACCES : 0; } -static int coda_pioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long user_data) +static long coda_pioctl(struct file *filp, unsigned int cmd, + unsigned long user_data) { struct path path; - int error; + int error; struct PioctlData data; - struct inode *target_inode = NULL; - struct coda_inode_info *cnp; + struct inode *inode = filp->f_dentry->d_inode; + struct inode *target_inode = NULL; + struct coda_inode_info *cnp; - /* get the Pioctl data arguments from user space */ - if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { - return -EINVAL; - } - - /* - * Look up the pathname. Note that the pathname is in - * user memory, and namei takes care of this - */ - if (data.follow) { - error = user_path(data.path, &path); - } else { - error = user_lpath(data.path, &path); + lock_kernel(); + + /* get the Pioctl data arguments from user space */ + if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { + error = -EINVAL; + goto out; } - - if ( error ) { - return error; - } else { + + /* + * Look up the pathname. 
Note that the pathname is in + * user memory, and namei takes care of this + */ + if (data.follow) + error = user_path(data.path, &path); + else + error = user_lpath(data.path, &path); + + if (error) + goto out; + else target_inode = path.dentry->d_inode; - } - + /* return if it is not a Coda inode */ - if ( target_inode->i_sb != inode->i_sb ) { + if (target_inode->i_sb != inode->i_sb) { path_put(&path); - return -EINVAL; + error = -EINVAL; + goto out; } /* now proceed to make the upcall */ - cnp = ITOC(target_inode); + cnp = ITOC(target_inode); error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); path_put(&path); - return error; -} +out: + unlock_kernel(); + return error; +} diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index be4392c..66b9cf7 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait) return mask; } -static int coda_psdev_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg) +static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg) { unsigned int data; @@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = { .read = coda_psdev_read, .write = coda_psdev_write, .poll = coda_psdev_poll, - .ioctl = coda_psdev_ioctl, + .unlocked_ioctl = coda_psdev_ioctl, .open = coda_psdev_open, .release = coda_psdev_release, }; diff --git a/fs/compat.c b/fs/compat.c index 0544873..f0b391c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -568,6 +568,79 @@ out: return ret; } +/* A write operation does a read from user space and vice versa */ +#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) + +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV || nr_segs < 0) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) { + *ret_pointer = fast_pointer; + goto out; + } + } + *ret_pointer = iov; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. The total length is fitting an ssize_t + * + * Be careful here because iov_len is a size_t not an ssize_t + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_ssize_t tmp = tot_len; + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ + goto out; + tot_len += len; + if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ + goto out; + if (!access_ok(vrfy_dir(type), buf, len)) { + ret = -EFAULT; + goto out; + } + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} + static inline long copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) { @@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); ret = copy_iocb(nr, iocb, iocb64); if (!ret) - ret = sys_io_submit(ctx_id, nr, iocb64); + ret = do_io_submit(ctx_id, nr, iocb64, 1); return ret; } @@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov=iovstack, *vector; + struct iovec *iov; ssize_t ret; - int seg; io_fn_t fn; iov_fn_t fnv; - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - ret = 0; - if (nr_segs == 0) - goto out; - - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ ret = -EINVAL; - if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) - goto out; if (!file->f_op) goto out; - if (nr_segs > UIO_FASTIOV) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (!iov) - goto out; - } + ret = -EFAULT; if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) goto out; - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t - * - * Be careful here because iov_len is a size_t not an ssize_t - */ - tot_len = 0; - vector = iov; - ret = -EINVAL; - for (seg = 0 ; seg < nr_segs; seg++) { - compat_ssize_t tmp = tot_len; - compat_ssize_t len; - compat_uptr_t buf; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting an compat_ssize_t .. 
*/ - goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; - vector->iov_base = compat_ptr(buf); - vector->iov_len = (compat_size_t) len; - uvector++; - vector++; - } + tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, + UIO_FASTIOV, iovstack, &iov); if (tot_len == 0) { ret = 0; goto out; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index c8af2d9..4164514 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -72,16 +72,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (!sd) return -EINVAL; - sd_iattr = sd->s_iattr; - - error = inode_change_ok(inode, iattr); - if (error) - return error; - - error = inode_setattr(inode, iattr); + error = simple_setattr(dentry, iattr); if (error) return error; + sd_iattr = sd->s_iattr; if (!sd_iattr) { /* setting attributes for the first time, allocate now */ sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 4d74fc7..0210898 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n" DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); + /* - * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value + * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned @@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_x32); +/** + * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. 
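For illustration (not part of the patch), the new x64 variant documented above is used like its 8/16/32-bit siblings; the directory, file and variable names here are invented:

    #include <linux/debugfs.h>

    static u64 my_counter;
    static struct dentry *mydrv_dir;

    static int mydrv_debugfs_init(void)
    {
            mydrv_dir = debugfs_create_dir("mydrv", NULL);
            if (!mydrv_dir)
                    return -ENOMEM;
            /* reads and writes of this file go straight to my_counter, formatted as 0x%016llx */
            debugfs_create_x64("counter", 0600, mydrv_dir, &my_counter);
            return 0;
    }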
+ */ +struct dentry *debugfs_create_x64(const char *name, mode_t mode, + struct dentry *parent, u64 *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_x64); +} +EXPORT_SYMBOL_GPL(debugfs_create_x64); + static int debugfs_size_t_set(void *data, u64 val) { diff --git a/fs/direct-io.c b/fs/direct-io.c index e82adc2..7600aac 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -82,6 +82,8 @@ struct dio { int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ dio_iodone_t *end_io; /* IO completion function */ + dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ @@ -96,6 +98,7 @@ struct dio { unsigned cur_page_offset; /* Offset into it, in bytes */ unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ sector_t cur_page_block; /* Where it starts */ + loff_t cur_page_fs_offset; /* Offset in file */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ @@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); } +/** + * dio_end_io - handle the end io action for the given bio + * @bio: The direct io bio thats being completed + * @error: Error if there was one + * + * This is meant to be called by any filesystem that uses their own dio_submit_t + * so that the DIO specific endio actions are dealt with after the filesystem + * has done it's completion work. + */ +void dio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + + if (dio->is_async) + dio_bio_end_aio(bio, error); + else + dio_bio_end_io(bio, error); +} +EXPORT_SYMBOL_GPL(dio_end_io); + static int dio_bio_alloc(struct dio *dio, struct block_device *bdev, sector_t first_sector, int nr_vecs) @@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, bio->bi_end_io = dio_bio_end_io; dio->bio = bio; + dio->logical_offset_in_bio = dio->cur_page_fs_offset; return 0; } @@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - submit_bio(dio->rw, bio); + if (dio->submit_io) + dio->submit_io(dio->rw, bio, dio->inode, + dio->logical_offset_in_bio); + else + submit_bio(dio->rw, bio); dio->bio = NULL; dio->boundary = 0; + dio->logical_offset_in_bio = 0; } /* @@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { + loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t bio_next_offset = dio->logical_offset_in_bio + + dio->bio->bi_size; + /* - * See whether this new request is contiguous with the old + * See whether this new request is contiguous with the old. + * + * Btrfs cannot handl having logically non-contiguous requests + * submitted. For exmple if you have + * + * Logical: [0-4095][HOLE][8192-12287] + * Phyiscal: [0-4095] [4096-8181] + * + * We cannot submit those pages together as one BIO. So if our + * current logical offset in the file does not equal what would + * be the next logical offset in the bio, submit the bio we + * have. 
*/ - if (dio->final_block_in_bio != dio->cur_page_block) + if (dio->final_block_in_bio != dio->cur_page_block || + cur_offset != bio_next_offset) dio_bio_submit(dio); /* * Submit now if the underlying fs is about to perform a @@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page, dio->cur_page_offset = offset; dio->cur_page_len = len; dio->cur_page_block = blocknr; + dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; out: return ret; } @@ -935,7 +981,7 @@ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - struct dio *dio) + dio_submit_t submit_io, struct dio *dio) { unsigned long user_addr; unsigned long flags; @@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->get_block = get_block; dio->end_io = end_io; + dio->submit_io = submit_io; dio->final_block_in_bio = -1; dio->next_block_for_io = -1; @@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, } } /* end iovec loop */ - if (ret == -ENOTBLK && (rw & WRITE)) { + if (ret == -ENOTBLK) { /* * The remaining part of the request will be * be handled by buffered I/O when we return @@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, return ret; } -/* - * This is a library function for use by filesystem drivers. - * - * The locking rules are governed by the flags parameter: - * - if the flags value contains DIO_LOCKING we use a fancy locking - * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is - * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). - * - * - if the flags value does NOT contain DIO_LOCKING we don't use any - * internal locking but rather rely on the filesystem to synchronize - * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. - */ ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int flags) + dio_submit_t submit_io, int flags) { int seg; size_t size; @@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, (end > i_size_read(inode))); retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, dio); + nr_segs, blkbits, get_block, end_io, + submit_io, dio); + +out: + return retval; +} +EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc); + +/* + * This is a library function for use by filesystem drivers. + * + * The locking rules are governed by the flags parameter: + * - if the flags value contains DIO_LOCKING we use a fancy locking + * scheme for dumb filesystems. + * For writes this function is called under i_mutex and returns with + * i_mutex held, for reads, i_mutex is not held on entry, but it is + * taken and dropped again before returning. 
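Stepping back to the dio_submit_t hook and the dio_end_io() export added earlier in this file: for illustration only (not part of the patch, and assuming the hook's signature matches the call site shown above), a filesystem-supplied submit hook is shaped like the sketch below. The function name is invented; a real user installs its own bi_end_io and calls dio_end_io() from it so the generic direct-IO completion still runs.

    /* illustrative dio_submit_t implementation */
    static void my_submit_io(int rw, struct bio *bio, struct inode *inode,
                             loff_t file_offset)
    {
            /*
             * A real hook would record (inode, file_offset) for its own
             * completion work before the bio goes down; whatever bi_end_io
             * it installs must eventually call dio_end_io(bio, error).
             */
            submit_bio(rw, bio);
    }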
+ * For reads and writes i_alloc_sem is taken in shared mode and released + * on I/O completion (which may happen asynchronously after returning to + * the caller). + * + * - if the flags value does NOT contain DIO_LOCKING we don't use any + * internal locking but rather rely on the filesystem to synchronize + * direct I/O reads/writes versus each other and truncate. + * For reads and writes both i_mutex and i_alloc_sem are not held on + * entry and are never taken. + */ +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + ssize_t retval; + retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, + offset, nr_segs, get_block, end_io, submit_io, flags); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in + * their own manner. This is a further example of where the old + * truncate sequence is inadequate. * * NOTE: filesystems with their own locking have to handle this * on their own. @@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (flags & DIO_LOCKING) { if (unlikely((rw & WRITE) && retval < 0)) { loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + if (end > isize) vmtruncate(inode, isize); } } -out: return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 3bdddbc..e8fcf4e 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) } static int -ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) +ecryptfs_fsync(struct file *file, int datasync) { return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 65dee2f..31ef525 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, - (ia->ia_size & ~PAGE_CACHE_MASK)); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = vmtruncate(inode, ia->ia_size); + rc = simple_setsize(inode, ia->ia_size); if (rc) goto out; lower_ia->ia_size = ia->ia_size; @@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, goto out; } } - vmtruncate(inode, ia->ia_size); + simple_setsize(inode, ia->ia_size); rc = ecryptfs_write_inode_size_to_metadata(inode); if (rc) { printk(KERN_ERR "Problem with " @@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. 
*/ + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; + vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); @@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags); @@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* @@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; - int count; if (thread_group_empty(tsk)) goto no_thread_group; @@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk) spin_unlock_irq(lock); return -EAGAIN; } + sig->group_exit_task = tsk; - zap_other_threads(tsk); + sig->notify_count = zap_other_threads(tsk); + if (!thread_group_leader(tsk)) + sig->notify_count--; - /* Account for the thread group leader hanging around: */ - count = thread_group_leader(tsk) ? 1 : 2; - sig->notify_count = count; - while (atomic_read(&sig->count) > count) { + while (sig->notify_count) { __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(lock); schedule(); @@ -1657,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state) struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; struct completion *vfork_done; - int core_waiters; + int core_waiters = -EBUSY; init_completion(&core_state->startup); core_state->dumper.task = tsk; core_state->dumper.next = NULL; - core_waiters = zap_threads(tsk, mm, core_state, exit_code); + + down_write(&mm->mmap_sem); + if (!mm->core_state) + core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1782,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file) } +/* + * uhm_pipe_setup + * helper function to customize the process used + * to collect the core in userspace. Specifically + * it sets up a pipe and installs it as fd 0 (stdin) + * for the process. Returns 0 on success, or + * PTR_ERR on failure. + * Note that it also sets the core limit to 1. 
This + * is a special value that we use to trap recursive + * core dumps + */ +static int umh_pipe_setup(struct subprocess_info *info) +{ + struct file *rp, *wp; + struct fdtable *fdt; + struct coredump_params *cp = (struct coredump_params *)info->data; + struct files_struct *cf = current->files; + + wp = create_write_pipe(0); + if (IS_ERR(wp)) + return PTR_ERR(wp); + + rp = create_read_pipe(wp, 0); + if (IS_ERR(rp)) { + free_write_pipe(wp); + return PTR_ERR(rp); + } + + cp->file = wp; + + sys_close(0); + fd_install(0, rp); + spin_lock(&cf->file_lock); + fdt = files_fdtable(cf); + FD_SET(0, fdt->open_fds); + FD_CLR(0, fdt->close_on_exec); + spin_unlock(&cf->file_lock); + + /* and disallow core files too */ + current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + + return 0; +} + void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; - struct inode * inode; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; - int ispipe = 0; - char **helper_argv = NULL; - int helper_argc = 0; - int dump_count = 0; + int ispipe; static atomic_t core_dump_count = ATOMIC_INIT(0); struct coredump_params cprm = { .signr = signr, @@ -1815,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) binfmt = mm->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; - - cred = prepare_creds(); - if (!cred) { - retval = -ENOMEM; + if (!__get_dumpable(cprm.mm_flags)) goto fail; - } - down_write(&mm->mmap_sem); - /* - * If another thread got here first, or we are not dumpable, bail out. - */ - if (mm->core_state || !__get_dumpable(cprm.mm_flags)) { - up_write(&mm->mmap_sem); - put_cred(cred); + cred = prepare_creds(); + if (!cred) goto fail; - } - /* * We cannot trust fsuid as being the "true" uid of the * process nor do we know its entire history. We only know it @@ -1844,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) } retval = coredump_wait(exit_code, &core_state); - if (retval < 0) { - put_cred(cred); - goto fail; - } + if (retval < 0) + goto fail_creds; old_cred = override_creds(cred); @@ -1865,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(corename, signr); unlock_kernel(); - if ((!ispipe) && (cprm.limit < binfmt->min_coredump)) - goto fail_unlock; - if (ispipe) { - if (cprm.limit == 0) { + int dump_count; + char **helper_argv; + + if (cprm.limit == 1) { /* * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * cprm.limit of 0 here as a speacial value. Any - * non-zero limit gets set to RLIM_INFINITY below, but + * cprm.limit of 1 here as a speacial value. Any + * non-1 limit gets set to RLIM_INFINITY below, but * a limit of 0 skips the dump. This is a consistent * way to catch recursive crashes. We can still crash - * if the core_pattern binary sets RLIM_CORE = !0 + * if the core_pattern binary sets RLIM_CORE = !1 * but it runs as root, and can do lots of stupid things * Note that we use task_tgid_vnr here to grab the pid * of the process group leader. That way we get the @@ -1885,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * core_pattern process dies. 
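For context (not part of the patch): this branch is taken when core_pattern begins with '|', e.g. writing |/usr/local/bin/core-collector %p %s to /proc/sys/kernel/core_pattern (helper path and arguments illustrative). The helper is spawned via the call_usermodehelper_fns() call just below, with umh_pipe_setup() above as its init function, so it reads the core dump from fd 0, the read end of the pipe installed there; %p and %s expand to the crashing task's pid and signal number.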
*/ printk(KERN_WARNING - "Process %d(%s) has RLIMIT_CORE set to 0\n", + "Process %d(%s) has RLIMIT_CORE set to 1\n", task_tgid_vnr(current), current->comm); printk(KERN_WARNING "Aborting core\n"); goto fail_unlock; } + cprm.limit = RLIM_INFINITY; dump_count = atomic_inc_return(&core_dump_count); if (core_pipe_limit && (core_pipe_limit < dump_count)) { @@ -1899,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); + helper_argv = argv_split(GFP_KERNEL, corename+1, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); goto fail_dropcount; } - cprm.limit = RLIM_INFINITY; - - /* SIGPIPE can happen, but it's just never processed */ - if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, - &cprm.file)) { + retval = call_usermodehelper_fns(helper_argv[0], helper_argv, + NULL, UMH_WAIT_EXEC, umh_pipe_setup, + NULL, &cprm); + argv_free(helper_argv); + if (retval) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); - goto fail_dropcount; + goto close_fail; } - } else + } else { + struct inode *inode; + + if (cprm.limit < binfmt->min_coredump) + goto fail_unlock; + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(cprm.file)) - goto fail_dropcount; - inode = cprm.file->f_path.dentry->d_inode; - if (inode->i_nlink > 1) - goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(cprm.file->f_path.dentry)) - goto close_fail; - - /* AK: actually i see no reason to not allow this for named pipes etc., - but keep the previous behaviour for now. */ - if (!ispipe && !S_ISREG(inode->i_mode)) - goto close_fail; - /* - * Dont allow local users get cute and trick others to coredump - * into their pre-created files: - * Note, this is not relevant for pipes - */ - if (!ispipe && (inode->i_uid != current_fsuid())) - goto close_fail; - if (!cprm.file->f_op) - goto close_fail; - if (!cprm.file->f_op->write) - goto close_fail; - if (!ispipe && - do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0) - goto close_fail; + if (IS_ERR(cprm.file)) + goto fail_unlock; - retval = binfmt->core_dump(&cprm); + inode = cprm.file->f_path.dentry->d_inode; + if (inode->i_nlink > 1) + goto close_fail; + if (d_unhashed(cprm.file->f_path.dentry)) + goto close_fail; + /* + * AK: actually i see no reason to not allow this for named + * pipes etc, but keep the previous behaviour for now. + */ + if (!S_ISREG(inode->i_mode)) + goto close_fail; + /* + * Dont allow local users get cute and trick others to coredump + * into their pre-created files. 
+ */ + if (inode->i_uid != current_fsuid()) + goto close_fail; + if (!cprm.file->f_op || !cprm.file->f_op->write) + goto close_fail; + if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + goto close_fail; + } + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; -close_fail: + if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); - filp_close(cprm.file, NULL); +close_fail: + if (cprm.file) + filp_close(cprm.file, NULL); fail_dropcount: - if (dump_count) + if (ispipe) atomic_dec(&core_dump_count); fail_unlock: - if (helper_argv) - argv_free(helper_argv); - + coredump_finish(mm); revert_creds(old_cred); +fail_creds: put_cred(cred); - coredump_finish(mm); fail: return; } diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4cfab1c..d91e9d8 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) de->inode_no = cpu_to_le64(parent->i_ino); memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); exofs_set_de_type(de, inode); - kunmap_atomic(page, KM_USER0); + kunmap_atomic(kaddr, KM_USER0); err = exofs_commit_chunk(page, 0, chunk_size); fail: page_cache_release(page); diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 839b9dc..fef6899 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp) return 0; } -static int exofs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int exofs_file_fsync(struct file *filp, int datasync) { int ret; struct address_space *mapping = filp->f_mapping; - struct inode *inode = dentry->d_inode; + struct inode *inode = mapping->host; struct super_block *sb; ret = filemap_write_and_wait(mapping); @@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry, static int exofs_flush(struct file *file, fl_owner_t id) { - exofs_file_fsync(file, file->f_path.dentry, 1); + exofs_file_fsync(file, 1); /* TODO: Flush the OSD target */ return 0; } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d7c6afa..4bb6ef8 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, return ret; } +static int exofs_releasepage(struct page *page, gfp_t gfp) +{ + EXOFS_DBGMSG("page 0x%lx\n", page->index); + WARN_ON(1); + return try_to_free_buffers(page); +} + +static void exofs_invalidatepage(struct page *page, unsigned long offset) +{ + EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page)); + WARN_ON(1); + + block_invalidatepage(page, offset); +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = { .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, + .releasepage = exofs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .invalidatepage = exofs_invalidatepage, + + /* Not implemented Yet */ + .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ + .direct_IO = NULL, /* TODO: Should be trivial to do */ + + /* With these NULL has special meaning or default is not exported */ + .sync_page = NULL, + .get_xip_mem = NULL, + .migratepage = NULL, + .launder_page = NULL, + .is_partially_uptodate = NULL, + .error_remove_page = NULL, }; /****************************************************************************** diff --git a/fs/ext2/ext2.h 
b/fs/ext2/ext2.h index 0b038e4..52b34f1 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_delete_inode (struct inode *); extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern void ext2_truncate (struct inode *); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); extern void ext2_get_inode_flags(struct ext2_inode_info *); @@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *); extern const struct file_operations ext2_dir_operations; /* file.c */ -extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int ext2_fsync(struct file *file, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; extern const struct file_operations ext2_xip_file_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5d198d0..49eec94 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp) return 0; } -int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) +int ext2_fsync(struct file *file, int datasync) { int ret; - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = file->f_mapping->host->i_sb; struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = simple_fsync(file, dentry, datasync); + ret = generic_file_fsync(file, datasync); if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, @@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = { #endif const struct inode_operations ext2_file_inode_operations = { - .truncate = ext2_truncate, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 527c46d..1921443 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) inode->i_blocks - ea_blocks == 0); } +static void ext2_truncate_blocks(struct inode *inode, loff_t offset); + +static void ext2_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + ext2_truncate_blocks(inode, inode->i_size); + } +} + /* * Called at the last iput() if i_nlink is zero. 
*/ @@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode) inode->i_size = 0; if (inode->i_blocks) - ext2_truncate (inode); + ext2_truncate_blocks(inode, 0); ext2_free_inode (inode); return; @@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext2_get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, ext2_get_block); } static int @@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int ret; + *pagep = NULL; - return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); + ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; +} + +static int ext2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ext2_write_failed(mapping, pos + len); + return ret; } static int @@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int ret; + /* * Dir-in-pagecache still uses ext2_write_begin. Would have to rework * directory handling code to pass around offsets rather than struct * pages in order to make this work easily. */ - return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext2_get_block); + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, + fsdata, ext2_get_block); + if (ret < 0) + ext2_write_failed(mapping, pos + len); + return ret; } static int ext2_nobh_writepage(struct page *page, @@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, ext2_get_block, NULL); + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + + ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, ext2_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + return ret; } static int @@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = { .writepage = ext2_writepage, .sync_page = block_sync_page, .write_begin = ext2_write_begin, - .write_end = generic_write_end, + .write_end = ext2_write_end, .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, @@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } -void ext2_truncate(struct inode *inode) +static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; struct ext2_inode_info *ei = EXT2_I(inode); @@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode) int n; long iblock; unsigned blocksize; - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if 
(ext2_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - blocksize = inode->i_sb->s_blocksize; - iblock = (inode->i_size + blocksize-1) - >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); - - if (mapping_is_xip(inode->i_mapping)) - xip_truncate_page(inode->i_mapping, inode->i_size); - else if (test_opt(inode->i_sb, NOBH)) - nobh_truncate_page(inode->i_mapping, - inode->i_size, ext2_get_block); - else - block_truncate_page(inode->i_mapping, - inode->i_size, ext2_get_block); + iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); n = ext2_block_to_path(inode, iblock, offsets, NULL); if (n == 0) @@ -1127,6 +1147,62 @@ do_indirects: ext2_discard_reservation(inode); mutex_unlock(&ei->truncate_mutex); +} + +static void ext2_truncate_blocks(struct inode *inode, loff_t offset) +{ + /* + * XXX: it seems like a bug here that we don't allow + * IS_APPEND inode to have blocks-past-i_size trimmed off. + * review and fix this. + * + * Also would be nice to be able to handle IO errors and such, + * but that's probably too much to ask. + */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (ext2_inode_is_fast_symlink(inode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ext2_truncate_blocks(inode, offset); +} + +int ext2_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (ext2_inode_is_fast_symlink(inode)) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + if (mapping_is_xip(inode->i_mapping)) + error = xip_truncate_page(inode->i_mapping, newsize); + else if (test_opt(inode->i_sb, NOBH)) + error = nobh_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + else + error = block_truncate_page(inode->i_mapping, + newsize, ext2_get_block); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + __ext2_truncate_blocks(inode, newsize); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); @@ -1134,6 +1210,8 @@ do_indirects: } else { mark_inode_dirty(inode); } + + return 0; } static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, @@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; } - error = inode_setattr(inode, iattr); - if (!error && (iattr->ia_valid & ATTR_MODE)) + if (iattr->ia_valid & ATTR_SIZE) { + error = ext2_setsize(inode, iattr->ia_size); + if (error) + return error; + } + generic_setattr(inode, iattr); + if (iattr->ia_valid & ATTR_MODE) error = ext2_acl_chmod(inode); + mark_inode_dirty(inode); + return error; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 71e9eb1..7ff43f4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + if (sb->s_dirt) ext2_write_super(sb); @@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext2_sops; sb->s_export_op = &ext2_export_ops; sb->s_xattr = ext2_xattr_handlers; + +#ifdef CONFIG_QUOTA + sb->dq_op = 
&dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif + root = ext2_iget(sb, EXT2_ROOT_INO); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) spin_unlock(&sbi->s_lock); return 0; } + /* * OK, we are remounting a valid rw partition rdonly, so set * the rdonly flag and then mark the partition as valid again. @@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) es->s_state = cpu_to_le16(sbi->s_mount_state); es->s_mtime = cpu_to_le32(get_seconds()); spin_unlock(&sbi->s_lock); + + err = dquot_suspend(sb, -1); + if (err < 0) { + spin_lock(&sbi->s_lock); + goto restore_opts; + } + ext2_sync_super(sb, es, 1); } else { __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, @@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) if (!ext2_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; spin_unlock(&sbi->s_lock); + ext2_write_super(sb); + + dquot_resume(sb, -1); } + return 0; restore_opts: sbi->s_mount_opt = old_opts.s_mount_opt; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 373fa90..e2e72c3 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root) kfree (old); } if (!parent) - root->rb_node = NULL; + *root = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index fcf7487..d7e9f74 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -43,9 +43,9 @@ * inode to disk. */ -int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext3_sync_file(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ext3_inode_info *ei = EXT3_I(inode); journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; int ret, needs_barrier = 0; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 0fc1293..6c953bb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb) struct ext3_super_block *es = sbi->s_es; int i, err; + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + lock_kernel(); ext3_xattr_put_super(sb); @@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount); + char *path); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = { static const struct quotactl_ops ext3_qctl_operations = { .quota_on = ext3_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; #endif @@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i, 0); + 
dquot_quota_off(sb, i); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) ext3_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext3_mount_options old_opts; + int enable_quota = 0; int err; #ifdef CONFIG_QUOTA int i; @@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) } if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount @@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; } } #ifdef CONFIG_QUOTA @@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #endif unlock_super(sb); unlock_kernel(); + + if (enable_quota) + dquot_resume(sb, -1); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type) */ static int ext3_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], - EXT3_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *name, int remount) + char *name) { int err; struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* When remounting, no checks are needed and in fact, name is NULL */ - if (remount) - return vfs_quota_on(sb, type, format_id, name, remount); err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) @@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, } } - err = vfs_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on_path(sb, type, format_id, &path); path_put(&path); return err; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d2f37a5..95b7594 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; - /* - * Account for the allocated meta blocks + * Account for the allocated meta blocks. We will never + * fail EDQUOT for metdata, but we do account for it. 
*/ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + dquot_alloc_block_nofail(inode, ar.len); } return ret; } diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 538c486..5b6973f 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi, else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - if (start_blk + count > (entry->start_blk + + if (start_blk + count > (entry->start_blk + entry->count)) - entry->count = (start_blk + count - + entry->count = (start_blk + count - entry->start_blk); new_node = *n; new_entry = rb_entry(new_node, struct ext4_system_zone, diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 86cb6d8..ea5e6cb 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - __ext4_error(dir->i_sb, function, - "bad entry in directory #%lu: %s - block=%llu" + ext4_error_inode(function, dir, + "bad entry in directory: %s - block=%llu" "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, - (unsigned long long) bh->b_blocknr, + error_msg, (unsigned long long) bh->b_blocknr, (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp, if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1))) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. 
*/ - EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); } stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); while (!error && !stored && filp->f_pos < inode->i_size) { - ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); - struct buffer_head map_bh; + struct ext4_map_blocks map; struct buffer_head *bh = NULL; - map_bh.b_state = 0; - err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); + map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { - pgoff_t index = map_bh.b_blocknr >> + pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); if (!ra_has_index(&filp->f_ra, index)) page_cache_sync_readahead( @@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp, &filp->f_ra, filp, index, 1); filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, blk, 0, &err); + bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); } /* @@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - ext4_error(sb, "directory #%lu " + EXT4_ERROR_INODE(inode, "directory " "contains a hole at offset %Lu", - inode->i_ino, (unsigned long long) filp->f_pos); dir_has_error = 1; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf938cf..19a4de5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,6 +29,9 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> +#ifdef __KERNEL__ +#include <linux/compat.h> +#endif /* * The fourth extended filesystem constants/structures @@ -54,10 +57,10 @@ #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode(__func__, (inode), (fmt), ## a); + ext4_error_inode(__func__, (inode), (fmt), ## a) #define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, (file), (fmt), ## a); + ext4_error_file(__func__, (file), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t; typedef unsigned int ext4_group_t; /* - * Flags used in mballoc's allocation_context flags field. + * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the traceport interface @@ -126,6 +129,29 @@ struct ext4_allocation_request { }; /* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. 
+ */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_UNINIT (1 << BH_Uninit) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_UNINIT) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* * For delayed allocation tracking */ struct mpage_da_data { @@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) return flags & EXT4_OTHER_FLMASK; } +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(RESERVED); +} + /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { __u32 group; /* Group number for this data */ @@ -332,6 +435,18 @@ struct ext4_new_group_input { __u16 unused; }; +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; @@ -355,7 +470,7 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the + so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an @@ -398,6 +513,7 @@ struct ext4_new_group_data { #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ @@ -408,11 +524,13 @@ struct ext4_new_group_data { #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #ifdef CONFIG_JBD2_DEBUG #define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) #endif #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif /* @@ -616,9 +734,8 @@ struct ext4_ext_cache { */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ - __u32 i_flags; - ext4_fsblk_t i_file_acl; __u32 i_dtime; + ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains @@ -629,6 +746,7 @@ struct ext4_inode_info { */ ext4_group_t i_block_group; unsigned long i_state_flags; /* Dynamic state flags */ + unsigned long i_flags; ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR @@ -1062,22 +1180,25 @@ enum { EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ }; -static inline int ext4_test_inode_state(struct inode *inode, int bit) -{ - return test_bit(bit, &EXT4_I(inode)->i_state_flags); -} - -static inline void ext4_set_inode_state(struct inode *inode, int bit) -{ - set_bit(bit, &EXT4_I(inode)->i_state_flags); +#define EXT4_INODE_BIT_FNS(name, field) \ +static inline 
int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit, &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit, &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit, &EXT4_I(inode)->i_##field); \ } -static inline void ext4_clear_inode_state(struct inode *inode, int bit) -{ - clear_bit(bit, &EXT4_I(inode)->i_state_flags); -} +EXT4_INODE_BIT_FNS(flag, flags) +EXT4_INODE_BIT_FNS(state, state_flags) #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 { #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ EXT4_FEATURE_COMPAT_DIR_INDEX) && \ - (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) @@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext4_sync_file(struct file *, struct dentry *, int); +extern int ext4_sync_file(struct file *, int); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -1678,6 +1799,7 @@ struct ext4_group_info { ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; @@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int max_blocks, - struct buffer_head *bh_result, int flags); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); @@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b79ad51..dade0c0 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode) return 1; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) return 1; - if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 1; return 0; } @@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode) return 0; if (!S_ISREG(inode->i_mode)) return 0; - if (EXT4_I(inode)->i_flags 
& EXT4_JOURNAL_DATA_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return 1; @@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode) return 0; if (EXT4_JOURNAL(inode) == NULL) return 1; - if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return 1; @@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 0; if (!S_ISREG(inode->i_mode)) return 0; - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return 0; if (ext4_should_journal_data(inode)) return 0; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 236b834..377309c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); - /* - * We have dropped i_data_sem so someone might have cached again - * an extent we are going to truncate. - */ - ext4_ext_invalidate_cache(inode); + if (err == 0) + err = -EAGAIN; return err; } @@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular * files will start at the second block group. This - * tends to speed up directory access and improves + * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); @@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - __ext4_error(inode->i_sb, function, - "bad header/extent in inode #%lu: %s - magic %x, " + ext4_error_inode(function, inode, + "bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", - inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), + error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); @@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode, merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) - ext4_error(inode->i_sb, - "inode#%lu, eh->eh_entries = 0!", - inode->i_ino); + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; @@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_ext_cache *cex; int ret = EXT4_EXT_CACHE_NO; - /* + /* * We borrow i_block_reservation_lock to protect i_cached_extent */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) int depth = ext_depth(inode); struct ext4_ext_path *path; handle_t *handle; - int i = 0, err = 0; + int i, err; ext_debug("truncate since %u\n", start); @@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (IS_ERR(handle)) return PTR_ERR(handle); +again: ext4_ext_invalidate_cache(inode); /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. 
*/ + depth = ext_depth(inode); path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; } + path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } - path[0].p_depth = depth; + i = err = 0; while (i >= 0 && err == 0) { if (i == depth) { @@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) out: ext4_ext_drop_refs(path); kfree(path); + if (err == -EAGAIN) + goto again; ext4_journal_stop(handle); return err; @@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error) /* FIXME!! we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { - int ret = -EIO; + int ret; struct bio *bio; int blkbits, blocksize; sector_t ee_pblock; @@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) len = ee_len; bio = bio_alloc(GFP_NOIO, len); + if (!bio) + return -ENOMEM; + bio->bi_sector = ee_pblock; bio->bi_bdev = inode->i_sb->s_bdev; @@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) submit_bio(WRITE, bio); wait_for_completion(&event); - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - ret = 0; - else { - ret = -EIO; - break; + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + bio_put(bio); + return -EIO; } bio_put(bio); ee_len -= done; ee_pblock += done << (blkbits - 9); } - return ret; + return 0; } #define EXT4_EXT_ZERO_LEN 7 /* - * This function is called by ext4_ext_get_blocks() if someone tries to write + * This function is called by ext4_ext_map_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized * extent into multiple extents (upto three - one initialized and two * uninitialized). 
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * c> Splits in three extents: Somone is writing in middle of the extent */ static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t iblock, - unsigned int max_blocks) + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) { struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; struct ext4_extent_header *eh; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; int ret = 0; + int may_zeroout; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (iblock - ee_block); - newblock = iblock - ee_block + ext_pblock(ex); + allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext_pblock(ex); + ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + may_zeroout = ee_block + ee_len <= eof_block; + err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, return allocated; } - /* ex1: ee_block to iblock - 1 : uninitialized */ - if (iblock > ee_block) { + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } @@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * we insert ex3, if ex1 is NULL. This is to avoid temporary * overlap of blocks. */ - if (!ex1 && allocated > max_blocks) - ex2->ee_len = cpu_to_le16(max_blocks); + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > max_blocks) { + if (allocated > map->m_len) { unsigned int newdepth; /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ - if (allocated <= EXT4_EXT_ZERO_LEN) { + if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { /* - * iblock == ee_block is handled by the zerouout + * map->m_lblk == ee_block is handled by the zerouout * at the beginning. * Mark first half uninitialized. 
* Mark second half initialized and zero out the @@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_dirty(handle, inode, path + depth); ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock); + ex3->ee_block = cpu_to_le32(map->m_lblk); ext4_ext_store_pblock(ex3, newblock); ex3->ee_len = cpu_to_le16(allocated); err = ext4_ext_insert_extent(handle, inode, path, @@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_len = orig_ex.ee_len; ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } else if (err) @@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ depth = ext_depth(inode); ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, - iblock, path); + path = ext4_ext_find_extent(inode, map->m_lblk, + path); if (IS_ERR(path)) { err = PTR_ERR(path); return err; @@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, return allocated; } ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock + max_blocks); - ext4_ext_store_pblock(ex3, newblock + max_blocks); - ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); ext4_ext_mark_uninitialized(ex3); err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } else if (err) @@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * update the extent length after successful insert of the * split extent */ - orig_ex.ee_len = cpu_to_le16(ee_len - - ext4_ext_get_actual_len(ex3)); + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + depth = newdepth; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; - allocated = max_blocks; + allocated = map->m_len; /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying * to insert a extent in the middle zerout directly * otherwise give the extent a chance to merge to left */ if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && - iblock != ee_block) { + map->m_lblk != ee_block && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } } @@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ if (ex1 && ex1 != ex) { ex1 = ex; - ex1->ee_len = 
cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } - /* ex2: iblock to iblock + maxblocks-1 : initialised */ - ex2->ee_block = cpu_to_le32(iblock); + /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(map->m_lblk); ext4_ext_store_pblock(ex2, newblock); ex2->ee_len = cpu_to_le16(allocated); if (ex2 != ex) @@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; insert: err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -2904,7 +2923,7 @@ fix_extent_len: } /* - * This function is called by ext4_ext_get_blocks() from + * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write * to an uninitialized extent. * @@ -2927,9 +2946,8 @@ fix_extent_len: */ static int ext4_split_unwritten_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, struct ext4_ext_path *path, - ext4_lblk_t iblock, - unsigned int max_blocks, int flags) { struct ext4_extent *ex, newex, orig_ex; @@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; struct ext4_extent_header *eh; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; + int may_zeroout; + + ext_debug("ext4_split_unwritten_extents: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; - ext_debug("ext4_split_unwritten_extents: inode %lu," - "iblock %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)iblock, max_blocks); depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (iblock - ee_block); - newblock = iblock - ee_block + ext_pblock(ex); + allocated = ee_len - (map->m_lblk - ee_block); + newblock = map->m_lblk - ee_block + ext_pblock(ex); + ex2 = ex; orig_ex.ee_block = ex->ee_block; orig_ex.ee_len = cpu_to_le16(ee_len); ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + may_zeroout = ee_block + ee_len <= eof_block; + + /* * If the uninitialized extent begins at the same logical * block where the write begins, and the write completely * covers the extent, then we don't need to split it. 
*/ - if ((iblock == ee_block) && (allocated <= max_blocks)) + if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) return allocated; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - /* ex1: ee_block to iblock - 1 : uninitialized */ - if (iblock > ee_block) { + /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ + if (map->m_lblk > ee_block) { ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } @@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle, * we insert ex3, if ex1 is NULL. This is to avoid temporary * overlap of blocks. */ - if (!ex1 && allocated > max_blocks) - ex2->ee_len = cpu_to_le16(max_blocks); + if (!ex1 && allocated > map->m_len) + ex2->ee_len = cpu_to_le16(map->m_len); /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > max_blocks) { + if (allocated > map->m_len) { unsigned int newdepth; ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock + max_blocks); - ext4_ext_store_pblock(ex3, newblock + max_blocks); - ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); + ext4_ext_store_pblock(ex3, newblock + map->m_len); + ex3->ee_len = cpu_to_le16(allocated - map->m_len); ext4_ext_mark_uninitialized(ex3); err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ - /* blocks available from iblock */ + /* blocks available from map->m_lblk */ return allocated; } else if (err) @@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle, * update the extent length after successful insert of the * split extent */ - orig_ex.ee_len = cpu_to_le16(ee_len - - ext4_ext_get_actual_len(ex3)); + ee_len -= ext4_ext_get_actual_len(ex3); + orig_ex.ee_len = cpu_to_le16(ee_len); + may_zeroout = ee_block + ee_len <= eof_block; + depth = newdepth; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, if (err) goto out; - allocated = max_blocks; + allocated = map->m_len; } /* * If there was a change of depth as part of the @@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle, */ if (ex1 && ex1 != ex) { ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); + ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); ext4_ext_mark_uninitialized(ex1); ex2 = &newex; } /* - * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, - * uninitialised still. + * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written + * using direct I/O, uninitialised still. 
*/ - ex2->ee_block = cpu_to_le32(iblock); + ex2->ee_block = cpu_to_le32(map->m_lblk); ext4_ext_store_pblock(ex2, newblock); ex2->ee_len = cpu_to_le16(allocated); ext4_ext_mark_uninitialized(ex2); @@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle, goto out; insert: err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC) { + if (err == -ENOSPC && may_zeroout) { err = ext4_ext_zeroout(inode, &orig_ex); if (err) goto fix_extent_len; @@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev, static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int max_blocks, + struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, - unsigned int allocated, struct buffer_head *bh_result, - ext4_fsblk_t newblock) + unsigned int allocated, ext4_fsblk_t newblock) { int ret = 0; int err = 0; @@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" "block %llu, max_blocks %u, flags %d, allocated %u", - inode->i_ino, (unsigned long long)iblock, max_blocks, + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, - inode, path, iblock, - max_blocks, flags); + ret = ext4_split_unwritten_extents(handle, inode, map, + path, flags); /* * Flag the inode(non aio case) or end_io struct (aio case) * that this IO needs to convertion to written when IO is @@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); if (ext4_should_dioread_nolock(inode)) - set_buffer_uninit(bh_result); + map->m_flags |= EXT4_MAP_UNINIT; goto out; } /* IO end_io complete, convert the filled extent to written */ @@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * the buffer head will be unmapped so that * a read from the block returns 0s. */ - set_buffer_unwritten(bh_result); + map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); + ret = ext4_ext_convert_to_initialized(handle, inode, map, path); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -3226,7 +3256,7 @@ out: goto out2; } else allocated = ret; - set_buffer_new(bh_result); + map->m_flags |= EXT4_MAP_NEW; /* * if we allocated more blocks than requested * we need to make sure we unmap the extra block @@ -3234,11 +3264,11 @@ out: * unmapped later when we find the buffer_head marked * new. 
*/ - if (allocated > max_blocks) { + if (allocated > map->m_len) { unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + max_blocks, - allocated - max_blocks); - allocated = max_blocks; + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; } /* @@ -3252,13 +3282,13 @@ out: ext4_da_update_reserve_space(inode, allocated, 0); map_out: - set_buffer_mapped(bh_result); + map->m_flags |= EXT4_MAP_MAPPED; out1: - if (allocated > max_blocks) - allocated = max_blocks; + if (allocated > map->m_len) + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; + map->m_pblk = newblock; + map->m_len = allocated; out2: if (path) { ext4_ext_drop_refs(path); @@ -3284,26 +3314,23 @@ out2: * * return < 0, error case. */ -int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, - unsigned int max_blocks, struct buffer_head *bh_result, - int flags) +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex, *last_ex; ext4_fsblk_t newblock; - int err = 0, depth, ret, cache_type; + int i, err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %lu\n", - iblock, max_blocks, inode->i_ino); + map->m_lblk, map->m_len, inode->i_ino); /* check in cache */ - cache_type = ext4_ext_in_cache(inode, iblock, &newex); + cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex); if (cache_type) { if (cache_type == EXT4_EXT_CACHE_GAP) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* we should allocate requested block */ } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { /* block is already allocated */ - newblock = iblock + newblock = map->m_lblk - le32_to_cpu(newex.ee_block) + ext_pblock(&newex); /* number of remaining blocks in the extent */ allocated = ext4_ext_get_actual_len(&newex) - - (iblock - le32_to_cpu(newex.ee_block)); + (map->m_lblk - le32_to_cpu(newex.ee_block)); goto out; } else { BUG(); @@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, } /* find extent for this block */ - path = ext4_ext_find_extent(inode, iblock, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " - "iblock: %d, depth: %d pblock %lld", - iblock, depth, path[depth].p_block); + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); err = -EIO; goto out2; } @@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ - if (in_range(iblock, ee_block, ee_len)) { - newblock = iblock - ee_block + ee_start; + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ - allocated = ee_len - (iblock - ee_block); - ext_debug("%u fit into %u:%d -> %llu\n", iblock, - ee_block, ee_len, 
newblock); + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); /* Do not put uninitialized extent in the cache */ if (!ext4_ext_is_uninitialized(ex)) { @@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out; } ret = ext4_ext_handle_uninitialized_extents(handle, - inode, iblock, max_blocks, path, - flags, allocated, bh_result, newblock); + inode, map, path, flags, allocated, + newblock); return ret; } } @@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, iblock); + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } /* @@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ /* find neighbour allocated blocks */ - ar.lleft = iblock; + ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out2; - ar.lright = iblock; + ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); if (err) goto out2; @@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is * EXT_UNINIT_MAX_LEN. */ - if (max_blocks > EXT_INIT_MAX_LEN && + if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - max_blocks = EXT_INIT_MAX_LEN; - else if (max_blocks > EXT_UNINIT_MAX_LEN && + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNINIT_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - max_blocks = EXT_UNINIT_MAX_LEN; + map->m_len = EXT_UNINIT_MAX_LEN; - /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ - newex.ee_block = cpu_to_le32(iblock); - newex.ee_len = cpu_to_le16(max_blocks); + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_block = cpu_to_le32(map->m_lblk); + newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else - allocated = max_blocks; + allocated = map->m_len; /* allocate new block */ ar.inode = inode; - ar.goal = ext4_ext_find_goal(inode, path, iblock); - ar.logical = iblock; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; ar.len = allocated; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; @@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_STATE_DIO_UNWRITTEN); } if (ext4_should_dioread_nolock(inode)) - set_buffer_uninit(bh_result); + map->m_flags |= EXT4_MAP_UNINIT; } - if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { + if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { if (unlikely(!eh->eh_entries)) { EXT4_ERROR_INODE(inode, - "eh->eh_entries == 0 ee_block %d", - ex->ee_block); + "eh->eh_entries == 0 and " + "EOFBLOCKS_FL set"); err = -EIO; goto out2; } last_ex = EXT_LAST_EXTENT(eh); - if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) - + ext4_ext_get_actual_len(last_ex)) - EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + /* + * If the current leaf block was reached by looking at + * the last index block all the way down the tree, and + * we are extending the inode beyond the last extent + * in the current leaf block, then clear the + * EOFBLOCKS_FL flag. 
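When the lookup finds an extent that already covers map->m_lblk, both the physical block and the number of usable blocks are simple offsets into that extent, exactly as in the cache-hit and in_range() cases above. A minimal userspace sketch of that arithmetic (function and variable names here are illustrative only):

#include <stdio.h>

/* Given an on-disk extent [ee_block, ee_block + ee_len) whose first
 * physical block is ee_start, map a logical block m_lblk that lies
 * inside it. */
static void map_within_extent(unsigned long long ee_start, unsigned ee_block,
                              unsigned ee_len, unsigned m_lblk)
{
    unsigned long long newblock = m_lblk - ee_block + ee_start;
    unsigned allocated = ee_len - (m_lblk - ee_block);

    printf("lblk %u -> pblk %llu, %u block(s) remain in this extent\n",
           m_lblk, newblock, allocated);
}

int main(void)
{
    map_within_extent(8192, 100, 16, 105);   /* -> pblk 8197, 11 blocks */
    return 0;
}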
+ */ + for (i = depth-1; i >= 0; i--) { + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + break; + } + if ((i < 0) && + (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex))) + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { @@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); - if (allocated > max_blocks) - allocated = max_blocks; - set_buffer_new(bh_result); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; /* * Update reserved blocks/metadata blocks after successful @@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * when it is _not_ an uninitialized extent. */ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock, EXT4_EXT_CACHE_EXTENT); ext4_update_inode_fsync_trans(handle, inode, 1); } else ext4_update_inode_fsync_trans(handle, inode, 0); out: - if (allocated > max_blocks) - allocated = max_blocks; + if (allocated > map->m_len) + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - set_buffer_mapped(bh_result); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; out2: if (path) { ext4_ext_drop_refs(path); @@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode, * can proceed even if the new size is the same as i_size. */ if (new_size > i_size_read(inode)) - EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } } @@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode, long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { handle_t *handle; - ext4_lblk_t block; loff_t new_size; unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; - struct buffer_head map_bh; + struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; /* * currently supporting (pre)allocate mode for extent-based * files _only_ */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; /* preallocation to directories is currently not supported */ if (S_ISDIR(inode->i_mode)) return -ENODEV; - block = offset >> blkbits; + map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; + - map.m_lblk; /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } retry: while (ret >= 0 && ret < max_blocks) { - block = block + ret; - max_blocks = max_blocks - ret; + map.m_lblk = map.m_lblk + ret; + map.m_len = max_blocks = max_blocks - ret; handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } - map_bh.b_state = 0; - ret = ext4_get_blocks(handle, inode, block, - max_blocks, &map_bh, + ret = ext4_map_blocks(handle, inode, &map, 
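ext4_fallocate() above now drives ext4_map_blocks() in a retry loop: the byte range is converted to a block range once (len alone is not enough when offset is not block aligned, as the 4096/3072/2048 comment notes), and each pass advances m_lblk and shrinks the remaining count by whatever the previous call mapped. A small runnable model of just that bookkeeping, with BLOCK_ALIGN standing in for EXT4_BLOCK_ALIGN and a dummy one-block mapper in place of ext4_map_blocks():

#include <stdio.h>

#define BLOCK_ALIGN(x, bits) (((x) + (1ULL << (bits)) - 1) >> (bits) << (bits))

int main(void)
{
    unsigned blkbits = 12;                 /* 4096-byte blocks */
    unsigned long long offset = 3072, len = 2048;

    /* Same conversion the kernel does: the range may straddle a
     * block boundary, so align the end up before counting blocks. */
    unsigned long long m_lblk = offset >> blkbits;
    unsigned long long max_blocks =
        (BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - m_lblk;

    printf("start block %llu, %llu block(s) to map\n", m_lblk, max_blocks);

    /* Retry loop: pretend each call maps exactly one block. */
    unsigned long long ret = 0, m_len;
    while (ret < max_blocks) {
        m_lblk += ret;
        m_len = max_blocks -= ret;
        ret = 1;                           /* stand-in for ext4_map_blocks() */
        printf("mapped %llu block(s) at %llu (remaining %llu)\n",
               ret, m_lblk, m_len - ret);
    }
    return 0;
}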
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_get_blocks " + printk(KERN_ERR "%s: ext4_ext_map_blocks " "returned error inode#%lu, block=%u, " "max_blocks=%u", __func__, inode->i_ino, block, max_blocks); @@ -3697,14 +3739,14 @@ retry: ret2 = ext4_journal_stop(handle); break; } - if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, blkbits) >> blkbits)) new_size = offset + len; else - new_size = (block + ret) << blkbits; + new_size = (map.m_lblk + ret) << blkbits; ext4_falloc_update_inode(inode, mode, new_size, - buffer_new(&map_bh)); + (map.m_flags & EXT4_MAP_NEW)); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) @@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len) { handle_t *handle; - ext4_lblk_t block; unsigned int max_blocks; int ret = 0; int ret2 = 0; - struct buffer_head map_bh; + struct ext4_map_blocks map; unsigned int credits, blkbits = inode->i_blkbits; - block = offset >> blkbits; + map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); while (ret >= 0 && ret < max_blocks) { - block = block + ret; - max_blocks = max_blocks - ret; + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } - map_bh.b_state = 0; - ret = ext4_get_blocks(handle, inode, block, - max_blocks, &map_bh, + ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_get_blocks " + printk(KERN_ERR "%s: ext4_ext_map_blocks " "returned error inode#%lu, block=%u, " "max_blocks=%u", __func__, - inode->i_ino, block, max_blocks); + inode->i_ino, map.m_lblk, map.m_len); } ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); @@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int error = 0; /* fallback to generic here if not in extents fmt */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, ext4_get_block); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d0776e4..5313ae4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, * is smaller than s_maxbytes, which is for extent-mapped files. */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); size_t length = iov_length(iov, nr_segs); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index ef3d980..592adf2 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -35,6 +35,29 @@ #include <trace/events/ext4.h> /* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. 
This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. + */ +static void ext4_sync_parent(struct inode *inode) +{ + struct dentry *dentry = NULL; + + while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = list_entry(inode->i_dentry.next, + struct dentry, d_alias); + if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + break; + inode = dentry->d_parent->d_inode; + sync_mapping_buffers(inode->i_mapping); + } +} + +/* * akpm: A new design for ext4_sync_file(). * * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). @@ -48,9 +71,9 @@ * i_mutex lock is held when entering and exiting this function */ -int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) +int ext4_sync_file(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret; @@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_ext4_sync_file(file, dentry, datasync); + trace_ext4_sync_file(file, datasync); if (inode->i_sb->s_flags & MS_RDONLY) return 0; @@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) ret = flush_completed_IO(inode); if (ret < 0) return ret; - - if (!journal) - return simple_fsync(file, dentry, datasync); + + if (!journal) { + ret = generic_file_fsync(file, datasync); + if (!ret && !list_empty(&inode->i_dentry)) + ext4_sync_parent(inode); + return ret; + } /* * data=writeback,ordered: @@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) (journal->j_flags & JBD2_BARRIER)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); - jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1a0e183..25c4b31 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; - /* Ok, now we can actually update the inode bitmaps.. 
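ext4_sync_parent() above walks upward from the inode being fsynced, flushing each parent directory for as long as the entries involved were freshly created. A toy userspace model of that walk (a parent pointer and an int flag stand in for the dentry list and EXT4_STATE_NEWENTRY; none of the kernel's locking or dentry handling is modelled):

#include <stdio.h>

struct dir {
    const char *name;
    struct dir *parent;
    int newly_created;      /* stands in for EXT4_STATE_NEWENTRY */
};

/* While the current node is flagged as newly created, clear the flag,
 * step to the parent and flush it, so later fsyncs stop early. */
static void sync_parents(struct dir *d)
{
    while (d && d->newly_created) {
        d->newly_created = 0;
        d = d->parent;
        if (!d)
            break;
        printf("flushing directory %s\n", d->name);
    }
}

int main(void)
{
    struct dir root = {"/",    NULL,  0};
    struct dir a    = {"/a",   &root, 1};
    struct dir b    = {"/a/b", &a,    1};

    sync_parents(&b);        /* flushes /a and then /, since both are new */
    return 0;
}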
*/ - cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), - bit, bitmap_bh->b_data); - if (!cleared) - ext4_error(sb, "bit already cleared for inode %lu", ino); - else { - gdp = ext4_get_group_desc(sb, block_group, &bh2); - + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bh2); - if (fatal) goto error_return; - - if (gdp) { - ext4_lock_group(sb, block_group); - count = ext4_free_inodes_count(sb, gdp) + 1; - ext4_free_inodes_set(sb, gdp, count); - if (is_directory) { - count = ext4_used_dirs_count(sb, gdp) - 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - atomic_dec(&sbi->s_flex_groups[f].used_dirs); - } + } + ext4_lock_group(sb, block_group); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, - block_group, gdp); - ext4_unlock_group(sb, block_group); - percpu_counter_inc(&sbi->s_freeinodes_counter); - if (is_directory) - percpu_counter_dec(&sbi->s_dirs_counter); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t f; - - f = ext4_flex_group(sbi, block_group); - atomic_inc(&sbi->s_flex_groups[f].free_inodes); - } - } - BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bh2); - if (!fatal) fatal = err; + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); } - BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (!fatal) - fatal = err; - sb->s_dirt = 1; + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; + } else + ext4_error(sb, "bit already cleared for inode %lu", ino); + error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); @@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, if (S_ISDIR(mode) && ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; @@ -1041,7 +1034,7 @@ got: if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { - EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e0f6af..19df61c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -149,7 +149,7 @@ int 
ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, int ret; /* - * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_mutex. So we can safely drop the i_data_sem here. @@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - __ext4_error(inode->i_sb, function, - "invalid block reference %u " - "in inode #%lu", blk, inode->i_ino); + ext4_error_inode(function, inode, + "invalid block reference %u", blk); return -EIO; } } @@ -785,7 +784,7 @@ failed: /* Allocation failed, free what we already allocated */ ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { - /* + /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. @@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, err_out: for (i = 1; i <= num; i++) { - /* + /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. @@ -890,9 +889,9 @@ err_out: } /* - * The ext4_ind_get_blocks() function handles non-extents inodes + * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_get_blocks(). + * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything @@ -917,9 +916,8 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ -static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int maxblocks, - struct buffer_head *bh_result, +static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { int err = -EIO; @@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, int count = 0; ext4_fsblk_t first_block = 0; - J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, iblock, offsets, + depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) @@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); count++; /*map more blocks*/ - while (count < maxblocks && count <= blocks_to_boundary) { + while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); @@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. 
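In the non-extent path, once ext4_ind_map_blocks() finds the first block it keeps counting while the following block pointers are physically contiguous, capped by map->m_len (the real code also stops at the indirect-block boundary, which is omitted here). The counting loop in stand-alone form, with a plain array in place of the on-disk pointer chain:

#include <stdio.h>

/* Count how many entries, starting at index 0, form one physically
 * contiguous run, capped at max_len (the caller's map->m_len). */
static unsigned count_contiguous(const unsigned *blks, unsigned n,
                                 unsigned max_len)
{
    unsigned first = blks[0];
    unsigned count = 1;

    while (count < max_len && count < n && blks[count] == first + count)
        count++;
    return count;
}

int main(void)
{
    unsigned blks[] = {500, 501, 502, 700, 701};

    printf("contiguous run: %u block(s)\n",
           count_contiguous(blks, 5, 16));      /* -> 3 */
    return 0;
}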
*/ - goal = ext4_find_goal(inode, iblock, partial); + goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; @@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * direct blocks to allocate for this branch. */ count = ext4_blks_to_allocate(partial, indirect_blks, - maxblocks, blocks_to_boundary); + map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) - err = ext4_splice_branch(handle, inode, iblock, + err = ext4_splice_branch(handle, inode, map->m_lblk, partial, indirect_blks, count); if (err) goto cleanup; - set_buffer_new(bh_result); + map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); + map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ @@ -1016,7 +1015,6 @@ cleanup: brelse(partial->bh); partial--; } - BUFFER_TRACE(bh_result, "returned"); out: return err; } @@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, */ static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) { - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); return ext4_indirect_calc_metadata_amount(inode, lblock); @@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - int mdb_free = 0, allocated_meta_blocks = 0; spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used); @@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; - used += ei->i_allocated_meta_blocks; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - allocated_meta_blocks = ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + used + ei->i_allocated_meta_blocks); ei->i_allocated_meta_blocks = 0; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); if (ei->i_reserved_data_blocks == 0) { /* @@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode, * only when we have written all of the delayed * allocation blocks. 
*/ - mdb_free = ei->i_reserved_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; - percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); } spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - /* Update quota subsystem */ - if (quota_claim) { + /* Update quota subsystem for data blocks */ + if (quota_claim) dquot_claim_block(inode, used); - if (mdb_free) - dquot_release_reservation_block(inode, mdb_free); - } else { + else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should - * not update the quota for allocated blocks. But then - * converting an fallocate region to initialized region would - * have caused a metadata allocation. So claim quota for - * that + * not re-claim the quota for fallocated blocks. */ - if (allocated_meta_blocks) - dquot_claim_block(inode, allocated_meta_blocks); - dquot_release_reservation_block(inode, mdb_free + used); + dquot_release_reservation_block(inode, used); } /* @@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode, ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, const char *msg, - sector_t logical, sector_t phys, int len) +static int check_block_validity(struct inode *inode, const char *func, + struct ext4_map_blocks *map) { - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - __ext4_error(inode->i_sb, msg, - "inode #%lu logical block %llu mapped to %llu " - "(size %d)", inode->i_ino, - (unsigned long long) logical, - (unsigned long long) phys, len); + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(func, inode, + "lblock %lu mapped to illegal pblock %llu " + "(length %d)", (unsigned long) map->m_lblk, + map->m_pblk, map->m_len); return -EIO; } return 0; @@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, } /* - * The ext4_get_blocks() function tries to look up the requested blocks, + * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * - * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocate. @@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, * * It returns the error in case of allocation failure. 
*/ -int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, - unsigned int max_blocks, struct buffer_head *bh, - int flags) +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { int retval; - clear_buffer_mapped(bh); - clear_buffer_unwritten(bh); - - ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," - "logical block %lu\n", inode->i_ino, flags, max_blocks, - (unsigned long)block); + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); /* * Try to see if we can get the block without requesting a new * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, - bh, 0); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, "file system corruption", - block, bh->b_blocknr, retval); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, __func__, map); if (ret != 0) return ret; } @@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * ext4_ext_get_block() returns th create = 0 * with buffer head unmapped. */ - if (retval > 0 && buffer_mapped(bh)) + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) return retval; /* @@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * of BH_Unwritten and BH_Mapped flags being simultaneously * set on the buffer_head. */ - clear_buffer_unwritten(bh); + map->m_flags &= ~EXT4_MAP_UNWRITTEN; /* * New blocks allocate and/or writing to uninitialized extent @@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * We need to check for EXT4 here because migrate * could have changed the inode type in between */ - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, flags); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { - retval = ext4_ind_get_blocks(handle, inode, block, - max_blocks, bh, flags); + retval = ext4_ind_map_blocks(handle, inode, map, flags); - if (retval > 0 && buffer_new(bh)) { + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { /* * We allocated new blocks which will result in * i_data's format changing. Force the migrate @@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, EXT4_I(inode)->i_delalloc_reserved_flag = 0; up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, "file system " - "corruption after allocation", - block, bh->b_blocknr, retval); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, + "ext4_map_blocks_after_alloc", + map); if (ret != 0) return ret; } @@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, /* Maximum number of blocks we map for direct IO at once. 
*/ #define DIO_MAX_BLOCKS 4096 -int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) { handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; int ret = 0, started = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; - if (create && !handle) { + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + if (flags && !handle) { /* Direct IO write... */ - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + return ret; } started = 1; } - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create ? EXT4_GET_BLOCKS_CREATE : 0); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } if (started) ext4_journal_stop(handle); -out: return ret; } +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int create, int *errp) { - struct buffer_head dummy; + struct ext4_map_blocks map; + struct buffer_head *bh; int fatal = 0, err; - int flags = 0; J_ASSERT(handle != NULL || create == 0); - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); - /* - * ext4_get_blocks() returns number of blocks mapped. 0 in - * case of a HOLE. - */ - if (err > 0) { - if (err > 1) - WARN_ON(1); - err = 0; + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + if (err < 0) + *errp = err; + if (err <= 0) + return NULL; + *errp = 0; + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!bh) { + *errp = -EIO; + return NULL; } - *errp = err; - if (!err && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; - goto err; - } - if (buffer_new(&dummy)) { - J_ASSERT(create != 0); - J_ASSERT(handle != NULL); + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); - /* - * Now that we do not always journal data, we should - * keep in mind whether this should always journal the - * new buffer as metadata. For now, regular file - * writes use ext4_get_block instead, so it's not a - * problem. 
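This is the core pattern of the whole patch: callers fill in m_lblk and m_len, ext4_map_blocks() fills in m_pblk, m_len and m_flags, and thin get_block-style wrappers such as _ext4_get_block() translate the result back into buffer_head state. A compact userspace sketch of that data flow; the struct mirrors the kernel's field names, but the mapper below is a dummy and the flag values are arbitrary stand-ins:

#include <stdio.h>

#define MAP_NEW        0x01   /* stand-ins for EXT4_MAP_NEW etc. */
#define MAP_MAPPED     0x02
#define MAP_UNWRITTEN  0x04

struct map_blocks {
    unsigned long long m_pblk;   /* result: first physical block */
    unsigned long      m_lblk;   /* request: first logical block */
    unsigned int       m_len;    /* request in, result out       */
    unsigned int       m_flags;  /* result flags                 */
};

/* Dummy mapper: pretend every logical block is stored 1000 blocks
 * further on disk and was already allocated. */
static int map_blocks(struct map_blocks *map)
{
    map->m_pblk = map->m_lblk + 1000;
    map->m_flags = MAP_MAPPED;
    return map->m_len;                 /* number of blocks mapped */
}

/* What a get_block-style wrapper does with the result. */
int main(void)
{
    struct map_blocks map = { .m_lblk = 42, .m_len = 8, .m_flags = 0 };
    int ret = map_blocks(&map);

    if (ret > 0)
        printf("lblk %lu..%lu -> pblk %llu..%llu (flags %#x)\n",
               map.m_lblk, map.m_lblk + ret - 1,
               map.m_pblk, map.m_pblk + ret - 1, map.m_flags);
    return 0;
}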
- */ - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data, 0, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (!fatal) - fatal = err; - } else { - BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); } - return bh; + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); } -err: - return NULL; + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, @@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long md_needed, md_reserved; + unsigned long md_needed; int ret; /* @@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) */ repeat: spin_lock(&ei->i_block_reservation_lock); - md_reserved = ei->i_reserved_meta_blocks; md_needed = ext4_calc_metadata_amount(inode, lblock); trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); /* - * Make quota reservation here to prevent quota overflow - * later. Real quota accounting is done at pages writeout - * time. + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. */ - ret = dquot_reserve_block(inode, md_needed + 1); + ret = dquot_reserve_block(inode, 1); if (ret) return ret; - + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - dquot_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the @@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free) * only when we have written all of the delayed * allocation blocks. 
*/ - to_free += ei->i_reserved_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ei->i_reserved_meta_blocks); ei->i_reserved_meta_blocks = 0; ei->i_da_metadata_calc_len = 0; } - /* update fs dirty blocks counter */ + /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) /* * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers * - * @mpd->inode - inode to walk through - * @exbh->b_blocknr - first block on a disk - * @exbh->b_size - amount of space in bytes - * @logical - first logical block to start assignment with - * * the function goes through all passed space and put actual disk * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten */ -static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, - struct buffer_head *exbh) +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) { struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - int blocks = exbh->b_size >> inode->i_blkbits; - sector_t pblock = exbh->b_blocknr, cur_logical; + int blocks = map->m_len; + sector_t pblock = map->m_pblk, cur_logical; struct buffer_head *head, *bh; pgoff_t index, end; struct pagevec pvec; int nr_pages, i; - index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); pagevec_init(&pvec, 0); @@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, /* skip blocks out of the range */ do { - if (cur_logical >= logical) + if (cur_logical >= map->m_lblk) break; cur_logical++; } while ((bh = bh->b_this_page) != head); do { - if (cur_logical >= logical + blocks) + if (cur_logical >= map->m_lblk + blocks) break; - if (buffer_delay(bh) || - buffer_unwritten(bh)) { + if (buffer_delay(bh) || buffer_unwritten(bh)) { BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); @@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); - if (buffer_uninit(exbh)) + if (map->m_flags & EXT4_MAP_UNINIT) set_buffer_uninit(bh); cur_logical++; pblock++; @@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } -/* - * __unmap_underlying_blocks - just a helper function to unmap - * set of blocks described by @bh - */ -static inline void __unmap_underlying_blocks(struct inode *inode, - struct buffer_head *bh) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - int blocks, i; - - blocks = bh->b_size >> inode->i_blkbits; - for (i = 0; i < blocks; i++) - unmap_underlying_metadata(bdev, bh->b_blocknr + i); -} - static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, sector_t logical, long blk_cnt) { @@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode) static int mpage_da_map_blocks(struct mpage_da_data *mpd) { int err, blks, get_blocks_flags; - struct buffer_head new; + struct ext4_map_blocks map; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; loff_t disksize = EXT4_I(mpd->inode)->i_disksize; @@ 
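mpage_put_bnr_to_bhs() now derives the page-cache range to walk directly from the map: the first and last page indices come from shifting the logical block numbers right by (PAGE_CACHE_SHIFT - blkbits). A quick stand-alone check of that arithmetic, using an assumed 4K-page / 1K-block layout purely for illustration:

#include <stdio.h>

int main(void)
{
    unsigned page_shift = 12;          /* 4096-byte pages  */
    unsigned blkbits    = 10;          /* 1024-byte blocks */
    unsigned long m_lblk = 10, m_len = 9;

    unsigned long index = m_lblk >> (page_shift - blkbits);
    unsigned long end   = (m_lblk + m_len - 1) >> (page_shift - blkbits);

    /* Blocks 10..18, with 4 blocks per page, live in pages 2..4. */
    printf("pages %lu..%lu\n", index, end);
    return 0;
}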
-2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting * variables are updated after the blocks have been allocated. */ - new.b_state = 0; + map.m_lblk = next; + map.m_len = max_blocks; get_blocks_flags = EXT4_GET_BLOCKS_CREATE; if (ext4_should_dioread_nolock(mpd->inode)) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, - &new, get_blocks_flags); + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { err = blks; /* @@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) ext4_msg(mpd->inode->i_sb, KERN_CRIT, "delayed block allocation failed for inode %lu at " "logical offset %llu with max blocks %zd with " - "error %d\n", mpd->inode->i_ino, + "error %d", mpd->inode->i_ino, (unsigned long long) next, mpd->b_size >> mpd->inode->i_blkbits, err); printk(KERN_CRIT "This should not happen!! " @@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } BUG_ON(blks == 0); - new.b_size = (blks << mpd->inode->i_blkbits); + if (map.m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = mpd->inode->i_sb->s_bdev; + int i; - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); + for (i = 0; i < map.m_len; i++) + unmap_underlying_metadata(bdev, map.m_pblk + i); + } /* * If blocks are delayed marked, we need to @@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) */ if ((mpd->b_state & (1 << BH_Delay)) || (mpd->b_state & (1 << BH_Unwritten))) - mpage_put_bnr_to_bhs(mpd, next, &new); + mpage_put_bnr_to_bhs(mpd, &map); if (ext4_should_order_data(mpd->inode)) { err = ext4_jbd2_file_inode(handle, mpd->inode); @@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t next; int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + /* + * XXX Don't go larger than mballoc is willing to allocate + * This is a stopgap solution. We eventually need to fold + * mpage_da_submit_io() into this function and then call + * ext4_get_blocks() multiple times in a loop + */ + if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + goto flush_it; + /* check if thereserved journal credits might overflow */ - if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { if (nrblocks >= EXT4_MAX_TRANS_DATA) { /* * With non-extent format we are limited by the journal @@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page, struct buffer_head *bh, *head; sector_t logical; - if (mpd->io_done) { - /* - * Rest of the page in the page_vec - * redirty then and skip then. We will - * try to write them again after - * starting a new transaction - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return MPAGE_DA_EXTENT_TAIL; - } /* * Can we merge this page to current extent? */ @@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page, * initialized properly. 
*/ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh, int create) { + struct ext4_map_blocks map; int ret = 0; sector_t invalid_block = ~((sector_t) 0xffff); @@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, invalid_block = ~0; BUG_ON(create == 0); - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + + map.m_lblk = iblock; + map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); - if ((ret == 0) && !buffer_delay(bh_result)) { - /* the block isn't (pre)allocated yet, let's reserve space */ + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) { + if (buffer_delay(bh)) + return 0; /* Not sure this could or should happen */ /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? @@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* not enough space to reserve */ return ret; - map_bh(bh_result, inode->i_sb, invalid_block); - set_buffer_new(bh_result); - set_buffer_delay(bh_result); - } else if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - if (buffer_unwritten(bh_result)) { - /* A delayed write to unwritten bh should - * be marked new and mapped. Mapped ensures - * that we don't do get_block multiple times - * when we write to the same offset and new - * ensures that we do proper zero out for - * partial write. - */ - set_buffer_new(bh_result); - set_buffer_mapped(bh_result); - } - ret = 0; + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; } - return ret; + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + + if (buffer_unwritten(bh)) { + /* A delayed write to unwritten bh should be marked + * new and mapped. Mapped ensures that we don't do + * get_block multiple times when we write to the same + * offset and new ensures that we do proper zero out + * for partial write. + */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; } /* @@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - int ret = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - - /* - * we don't want to do block allocation in writepage - * so call get_block_wrap with create = 0 - */ - ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - return ret; + return _ext4_get_block(inode, iblock, bh_result, 0); } static int bget_one(handle_t *handle, struct buffer_head *bh) @@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * number of contiguous block. 
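The rewritten ext4_da_get_block_prep() distinguishes three cases: the block is already mapped (or preallocated), it is already marked delayed, or space must be reserved now and the buffer flagged new + delayed against a deliberately bogus block number. A loose userspace model of that decision; the BH_* values, the struct and invalid_block below are simplified placeholders, and the real reservation and unwritten-buffer handling are omitted:

#include <stdio.h>

#define BH_MAPPED 0x1
#define BH_NEW    0x2
#define BH_DELAY  0x4

struct bh { unsigned long long blocknr; unsigned state; };

/* mapped_len > 0 means the lookup found a real block at pblk;
 * already_delayed models a buffer that was reserved earlier. */
static void da_prep(struct bh *bh, int mapped_len,
                    unsigned long long pblk, int already_delayed)
{
    const unsigned long long invalid_block = ~0ULL;

    if (mapped_len > 0) {              /* block or preallocation exists */
        bh->blocknr = pblk;
        bh->state = BH_MAPPED;
        return;
    }
    if (already_delayed)               /* reservation already made */
        return;

    /* Reserve space now; point the buffer at a junk block number and
     * mark it new + delayed so writeback allocates it later. */
    bh->blocknr = invalid_block;
    bh->state = BH_NEW | BH_DELAY;
}

int main(void)
{
    struct bh bh = {0, 0};

    da_prep(&bh, 0, 0, 0);
    printf("state %#x blocknr %#llx\n", bh.state, bh.blocknr);
    return 0;
}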
So we will limit * number of contiguous block to a sane value */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && (max_blocks > EXT4_MAX_TRANS_DATA)) max_blocks = EXT4_MAX_TRANS_DATA; return ext4_chunk_trans_blocks(inode, max_blocks); } +/* + * write_cache_pages_da - walk the list of dirty pages of the given + * address space and call the callback function (which usually writes + * the pages). + * + * This is a forked version of write_cache_pages(). Differences: + * Range cyclic is ignored. + * no_nrwrite_index_update is always presumed true + */ +static int write_cache_pages_da(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd) +{ + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + long nr_to_write = wbc->nr_to_write; + + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + done = 1; + break; + } + + lock_page(page); + + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; + } + + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = __mpage_da_writepage(page, wbc, mpd); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + done = 1; + break; + } + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. 
+ */ + done = 1; + break; + } + } + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} + + static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping, handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; - int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; unsigned int max_pages; @@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping, mpd.wbc = wbc; mpd.inode = mapping->host; - /* - * we don't want write_cache_pages to update - * nr_to_write and writeback_index - */ - no_nrwrite_index_update = wbc->no_nrwrite_index_update; - wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; retry: @@ -2941,7 +3011,7 @@ retry: if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " - "%ld pages, ino %lu; err %d\n", __func__, + "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); goto out_writepages; } @@ -2963,8 +3033,7 @@ retry: mpd.io_done = 0; mpd.pages_written = 0; mpd.retval = 0; - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, - &mpd); + ret = write_cache_pages_da(mapping, wbc, &mpd); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -3016,7 +3085,7 @@ retry: if (pages_skipped != wbc->pages_skipped) ext4_msg(inode->i_sb, KERN_CRIT, "This should not happen leaving %s " - "with nr_to_write = %ld ret = %d\n", + "with nr_to_write = %ld ret = %d", __func__, wbc->nr_to_write, ret); /* Update index */ @@ -3030,8 +3099,6 @@ retry: mapping->writeback_index = index; out_writepages: - if (!no_nrwrite_index_update) - wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); @@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret, retries = 0, quota_retries = 0; + int ret, retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -3135,22 +3202,6 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - - if ((ret == -EDQUOT) && - EXT4_I(inode)->i_reserved_meta_blocks && - (quota_retries++ < 3)) { - /* - * Since we often over-estimate the number of meta - * data blocks required, we may sometimes get a - * spurios out of quota error even though there would - * be enough space once we write the data blocks and - * find out how many meta data blocks were _really_ - * required. So try forcing the inode write to see if - * that helps. - */ - write_inode_now(inode, (quota_retries == 3)); - goto retry; - } out: return ret; } @@ -3546,46 +3597,18 @@ out: return ret; } +/* + * ext4_get_block used when preparing for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after the IO is complete. 
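write_cache_pages_da() above is a trimmed-down fork of write_cache_pages(): it looks dirty pages up in pagevec-sized batches and, for non-integrity writeback, stops once the nr_to_write budget is spent. The control flow reduced to a runnable loop over an array (PAGEVEC_SIZE, the page struct and the dirty flag are stand-ins for the real page-cache machinery):

#include <stdio.h>

#define PAGEVEC_SIZE 14            /* batch size, as in the page cache code */

struct page { int index; int dirty; };

/* Walk pages[0..n) in batches, "writing" dirty ones until the
 * nr_to_write budget runs out (an integrity sync would keep going). */
static void write_pages(struct page *pages, int n, long nr_to_write)
{
    int done = 0, i = 0;

    while (!done && i < n) {
        int batch = n - i < PAGEVEC_SIZE ? n - i : PAGEVEC_SIZE;
        int j;

        for (j = 0; j < batch; j++) {
            struct page *pg = &pages[i + j];

            if (!pg->dirty)            /* someone else wrote it for us */
                continue;
            pg->dirty = 0;
            printf("writing page %d\n", pg->index);
            if (--nr_to_write == 0) {
                done = 1;
                break;
            }
        }
        i += batch;
    }
}

int main(void)
{
    struct page pages[4] = { {0, 1}, {1, 0}, {2, 1}, {3, 1} };

    write_pages(pages, 4, 2);          /* writes pages 0 and 2, then stops */
    return 0;
}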
+ */ static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = ext4_journal_current_handle(); - int ret = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int dio_credits; - int started = 0; - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); - /* - * ext4_get_block in prepare for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after IO complete. - */ - create = EXT4_GET_BLOCKS_IO_CREATE_EXT; - - if (!handle) { - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - started = 1; - } - - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; - } - if (started) - ext4_journal_stop(handle); -out: - return ret; + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); } static void dump_completed_IO(struct inode * inode) @@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { - ext4_error(inode->i_sb, "inode #%lu: " - "attempt to clear blocks %llu len %lu, invalid", - inode->i_ino, (unsigned long long) block_to_free, - count); + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); return 1; } @@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else - ext4_error(inode->i_sb, - "circular indirect block detected, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long) this_bh->b_blocknr); + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); } } @@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), nr, 1)) { - ext4_error(inode->i_sb, - "indirect mapped block in inode " - "#%lu invalid (level %d, blk #%lu)", - inode->i_ino, depth, - (unsigned long) nr); + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); break; } @@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). 
*/ if (!bh) { - ext4_error(inode->i_sb, - "Read failure, inode=%lu, block=%llu", - inode->i_ino, nr); + EXT4_ERROR_INODE(inode, + "Read failure block=%llu", + (unsigned long long) nr); continue; } @@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_ext_truncate(inode); return; } @@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - ext4_error(sb, "unable to read inode block - " - "inode=%lu, block=%llu", inode->i_ino, block); + EXT4_ERROR_INODE(inode, "unable to read inode block - " + "block %llu", block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4884,8 +4904,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(sb, "unable to read inode block - inode=%lu," - " block=%llu", inode->i_ino, block); + EXT4_ERROR_INODE(inode, "unable to read inode " + "block %llu", block); brelse(bh); return -EIO; } @@ -5096,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - ext4_error(sb, "bad extended attribute block %llu inode #%lu", - ei->i_file_acl, inode->i_ino); + EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ei->i_file_acl); ret = -EIO; goto bad_inode; } else if (ei->i_flags & EXT4_EXTENTS_FL) { @@ -5142,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { ret = -EIO; - ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", - inode->i_mode, inode->i_ino); + EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); @@ -5381,9 +5400,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error(inode->i_sb, "IO error syncing inode, " - "inode=%lu, block=%llu", inode->i_ino, - (unsigned long long)iloc.bh->b_blocknr); + EXT4_ERROR_INODE(inode, + "IO error syncing inode (block=%llu)", + (unsigned long long) iloc.bh->b_blocknr); err = -EIO; } brelse(iloc.bh); @@ -5455,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { @@ -5468,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && (attr->ia_size < inode->i_size || - (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { + (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5500,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } } /* ext4_truncate will clear the flag */ - if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) + if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) ext4_truncate(inode); } @@ -5576,7 +5595,7 @@ static int 
ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_indirect_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } @@ -5911,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ if (val) - EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else - EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 016d024..bf5ae88 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -258,7 +258,7 @@ setversion_out: if (me.moved_len > 0) file_remove_suid(donor_filp); - if (copy_to_user((struct move_extent __user *)arg, + if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; mext_out: @@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; - case EXT4_IOC_GROUP_ADD: + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_input input; + mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b423a36..12b3bc0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +/* + * Cache the order of the largest free extent we have available in this block + * group. + */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) @@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, */ grp->bb_free = free; } + mb_set_largest_free_order(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); @@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! 
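The new EXT4_IOC32_GROUP_ADD branch above is a standard 32-bit compat thunk: copy the userspace struct field by field with get_user(), then invoke the native ioctl on the kernel copy under set_fs(KERNEL_DS). Condensed (two of the six fields shown):

	/* Condensed from the EXT4_IOC32_GROUP_ADD case above. */
	static long compat_group_add_sketch(struct file *file, unsigned long arg)
	{
		struct compat_ext4_new_group_input __user *uinput = compat_ptr(arg);
		struct ext4_new_group_input input;
		mm_segment_t old_fs;
		int err;

		err = get_user(input.group, &uinput->group);
		err |= get_user(input.blocks_count, &uinput->blocks_count);
		/* ...remaining fields copied the same way... */
		if (err)
			return -EFAULT;

		old_fs = get_fs();
		set_fs(KERNEL_DS);	/* the native ioctl now reads the kernel copy */
		err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, (unsigned long)&input);
		set_fs(old_fs);
		return err;
	}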
*/ static int ext4_mb_init_cache(struct page *page, char *incore) @@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore == NULL); mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore != NULL); mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); @@ -910,6 +937,11 @@ out: return err; } +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { @@ -1004,6 +1036,11 @@ err: return ret; } +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) @@ -1150,7 +1187,7 @@ err: return ret; } -static void ext4_mb_release_desc(struct ext4_buddy *e4b) +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); @@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, buddy = buddy2; } while (1); } + mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); @@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +/* This is now called BEFORE we load the buddy bitmap. 
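The locking notes added above pin down the calling convention for the buddy cache: ext4_mb_load_buddy()/ext4_mb_init_group() may take the block-group locks of every group on the page, so they must run before ext4_lock_group(), and the group has to be re-checked once the lock is held. A skeleton of that ordering, condensed from the allocator loop changed later in this patch:

	/* Skeleton of the required ordering (condensed from ext4_mb_regular_allocator). */
	static int scan_one_group_sketch(struct ext4_allocation_context *ac,
					 ext4_group_t group, int cr)
	{
		struct super_block *sb = ac->ac_sb;
		struct ext4_buddy e4b;
		int err;

		/* cheap check first; may initialise the group, so no BG lock held */
		if (!ext4_mb_good_group(ac, group, cr))
			return 0;

		err = ext4_mb_load_buddy(sb, group, &e4b);	/* may lock all groups of the page */
		if (err)
			return err;

		ext4_lock_group(sb, group);
		if (ext4_mb_good_group(ac, group, cr))	/* re-check: someone may have allocated */
			ext4_mb_simple_scan_group(ac, &e4b);	/* cr-dependent scan elided */
		ext4_unlock_group(sb, group);

		ext4_mb_unload_buddy(&e4b);
		return 0;
	}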
*/ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; - unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); - BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } free = grp->bb_free; fragments = grp->bb_fragments; @@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, case 0: BUG_ON(ac->ac_2order == 0); + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; - for (i = ac->ac_2order; i <= bits; i++) - if (grp->bb_counters[i] > 0) - return 1; - break; + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; @@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ - if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); @@ -2024,15 +2068,11 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - struct ext4_group_info *grp; - struct ext4_group_desc *desc; - if (group == ngroups) group = 0; - /* quick check to skip empty groups */ - grp = ext4_get_group_info(sb, group); - if (grp->bb_free == 0) + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) continue; err = ext4_mb_load_buddy(sb, group, &e4b); @@ -2040,15 +2080,18 @@ repeat: goto out; ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ if (!ext4_mb_good_group(ac, group, cr)) { - /* someone did allocation from this group */ ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; - desc = ext4_get_group_desc(sb, group, NULL); if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && @@ -2058,7 +2101,7 @@ repeat: ext4_mb_complex_scan_group(ac, &e4b); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; @@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ext4_lock_group(sb, group); memcpy(&sg, ext4_get_group_info(sb, group), i); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); @@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ #ifdef DOUBLE_CHECK { @@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) entry->count, entry->group, entry); if (test_opt(sb, DISCARD)) { + 
int ret; ext4_fsblk_t discard_block; discard_block = entry->start_blk + @@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, entry->count); - sb_issue_discard(sb, discard_block, entry->count); + ret = sb_issue_discard(sb, discard_block, entry->count); + if (ret == EOPNOTSUPP) { + ext4_warning(sb, + "discard not supported, disabling"); + clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); + } } err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) } ext4_unlock_group(sb, entry->group); kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); } mb_debug(1, "freed %u blocks in %u structures\n", count, count2); @@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void) void exit_ext4_mballoc(void) { - /* + /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ @@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); - if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && @@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) continue; /* non-extent files can't have physical blocks past 2^32 */ - if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) continue; @@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* + /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ @@ -3697,7 +3747,7 @@ out: ext4_unlock_group(sb, group); if (ac) kmem_cache_free(ext4_ac_cachep, ac); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; } @@ -3801,7 +3851,7 @@ repeat: if (bitmap_bh == NULL) { ext4_error(sb, "Error reading block bitmap for %u", group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } @@ -3810,7 +3860,7 @@ repeat: ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); @@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_mb_release_group_pa(&e4b, pa, ac); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } @@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); } } - /* + /* * We need to make sure we don't reuse the freed block until * after the transaction is committed, which we can do by * treating the block as metadata, below. 
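The release_blocks_on_commit() hunk above now checks the result of sb_issue_discard() and turns the DISCARD mount option off the first time the device reports EOPNOTSUPP, instead of re-issuing a doomed discard on every commit. In miniature:

	/* Condensed from the hunk above: disable discard permanently once the
	 * device says it cannot do it. */
	static void try_discard_sketch(struct super_block *sb,
				       ext4_fsblk_t discard_block,
				       unsigned long count)
	{
		int ret;

		if (!test_opt(sb, DISCARD))
			return;

		ret = sb_issue_discard(sb, discard_block, count);
		if (ret == EOPNOTSUPP) {
			ext4_warning(sb, "discard not supported, disabling");
			clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
		}
	}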
We make an @@ -4610,7 +4660,7 @@ do_more: atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); } - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); freed += count; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 34dcfc5..6f3a27e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) */ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_INCOMPAT_EXTENTS) || - (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d1fc662..3a6c92a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, int depth = ext_depth(orig_inode); int ret; + start_ext.ee_block = end_ext.ee_block = 0; o_start = o_end = oext = orig_path[depth].p_ext; oext_alen = ext4_ext_get_actual_len(oext); start_ext.ee_len = end_ext.ee_len = 0; @@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * new_ext |-------| */ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - ext4_error(orig_inode->i_sb, + EXT4_ERROR_INODE(orig_inode, "new_ext_end(%u) should be less than or equal to " "oext->ee_block(%u) + oext_alen(%d) - 1", new_ext_end, le32_to_cpu(oext->ee_block), @@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, while (1) { /* The extent for donor must be found. */ if (!dext) { - ext4_error(donor_inode->i_sb, + EXT4_ERROR_INODE(donor_inode, "The extent for donor must be found"); *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - ext4_error(donor_inode->i_sb, + EXT4_ERROR_INODE(donor_inode, "Donor offset(%u) and the first block of donor " "extent(%u) should be equal", donor_off, @@ -976,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode, } /* Ext4 move extent supports only extent based file */ - if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " "based file [ino:orig %lu]\n", orig_inode->i_ino); return -EOPNOTSUPP; - } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: donor file is not extents " "based file [ino:donor %lu]\n", donor_inode->i_ino); return -EOPNOTSUPP; @@ -1354,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (ret1 < 0) break; if (*moved_len > len) { - ext4_error(orig_inode->i_sb, + EXT4_ERROR_INODE(orig_inode, "We replaced blocks too much! 
" "sum of replaced: %llu requested: %llu", *moved_len, len); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0c070fa..a43e661 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) return blocksize; return (len & 65532) | ((len & 3) << 16); } - + __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) @@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); - else + else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); @@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, brelse(bh); } if (bcount) - printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; @@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int ret, err; __u32 hashval; - dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = dir_file->f_path.dentry->d_inode; - if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { + if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += @@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } /* @@ -943,8 +943,8 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, "reading directory #%lu offset %lu", - dir->i_ino, (unsigned long)block); + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); brelse(bh); goto next; } @@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { - ext4_error(dir->i_sb, "bad inode number: %u", ino); + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); if (unlikely(IS_ERR(inode))) { if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, - "deleted inode referenced: %u", - ino); + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); return ERR_PTR(-EIO); } else { return ERR_CAST(inode); @@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child) brelse(bh); if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - ext4_error(child->d_inode->i_sb, - "bad inode number: %u", ino); + EXT4_ERROR_INODE(child->d_inode, + "bad parent inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, unsigned rec_len = 0; while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); @@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry 
*dentry, de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { - ext4_error(dir->i_sb, - "invalid rec_len for '..' in inode %lu", - dir->i_ino); + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); brelse(bh); return -EIO; } @@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, brelse(bh); return retval; } - EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data1 = bh2->b_data; memcpy (data1, de, len); @@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) return retval; - EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); } @@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); return retval; } @@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, - "error %d reading directory #%lu offset 0", - err, inode->i_ino); + EXT4_ERROR_INODE(inode, + "error %d reading directory lblock 0", err); else ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no data block", @@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode) de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + unsigned int lblock; err = 0; brelse(bh); - bh = ext4_bread(NULL, inode, - offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_bread(NULL, inode, lblock, 0, &err); if (!bh) { if (err) - ext4_error(sb, - "error %d reading directory" - " #%lu offset %u", - err, inode->i_ino, offset); + EXT4_ERROR_INODE(inode, + "error %d reading directory " + "lblock %u", err, lblock); offset += sb->s_blocksize; continue; } @@ -2297,7 +2296,7 @@ retry: } } else { /* clear the extent format for fast symlink */ - EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); inode->i_op = &ext4_fast_symlink_inode_operations; memcpy((char *)&EXT4_I(inode)->i_data, symname, l); inode->i_size = l-1; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 5692c48..6df797e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, input->group); atomic_add(input->free_blocks_count, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e14d22c..4e8983a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); + 
vfs_check_frozen(sb, SB_FREEZE_WRITE); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ @@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + flush_workqueue(sbi->dio_unwritten_wq); destroy_workqueue(sbi->dio_unwritten_wq); @@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) seq_puts(seq, ",journal_async_commit"); + else if (test_opt(sb, JOURNAL_CHECKSUM)) + seq_puts(seq, ",journal_checksum"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); if (test_opt(sb, I_VERSION)) @@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount); + char *path); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = { static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; #endif @@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i, 0); + dquot_quota_off(sb, i); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) struct ext4_attr { struct attribute attr; ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, const char *, size_t); int offset; }; @@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) { + char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi; @@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - } - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount3; - } - sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -2910,6 
+2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount_wq; + } if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " @@ -3001,7 +3003,7 @@ no_journal: err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)\n", err); + "zone (%d)", err); goto failed_mount4; } @@ -3040,9 +3042,11 @@ no_journal: } else descr = "out journal"; - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s", descr, orig_data); lock_kernel(); + kfree(orig_data); return 0; cantfind_ext4: @@ -3059,6 +3063,10 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3066,10 +3074,6 @@ failed_mount3: else kfree(sbi->s_flex_groups); } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3089,6 +3093,7 @@ out_fail: kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); + kfree(orig_data); return ret; } @@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!(sb->s_flags & MS_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - EXT4_SB(sb)->s_sectors_written_start) >> 1)); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( @@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) + if (journal) { + vfs_check_frozen(sb, SB_FREEZE_WRITE); ret = ext4_journal_force_commit(journal); + } return ret; } @@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb) * the journal. */ error = jbd2_journal_flush(journal); - if (error < 0) { - out: - jbd2_journal_unlock_updates(journal); - return error; - } + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. 
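The freeze path above no longer returns with the journal barrier held: after flushing the journal and writing the superblock it calls jbd2_journal_unlock_updates() again and relies on the VFS s_frozen state to fence writers, which is why vfs_check_frozen(sb, SB_FREEZE_WRITE) is added to ext4_journal_start_sb() and ext4_force_commit() earlier in this patch. Condensed view of the new sequence (the jbd2_journal_lock_updates() call sits just above the shown hunk and is assumed here):

	/* Condensed sketch of the reworked freeze sequence. */
	static int freeze_sketch(struct super_block *sb)
	{
		journal_t *journal = EXT4_SB(sb)->s_journal;
		int error;

		jbd2_journal_lock_updates(journal);	/* assumed: precedes the shown hunk */
		error = jbd2_journal_flush(journal);
		if (error < 0)
			goto out;

		/* journal is clean: a crash while frozen needs no recovery */
		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
		error = ext4_commit_super(sb, 1);
	out:
		/* from here on, sb->s_frozen keeps new writers out */
		jbd2_journal_unlock_updates(journal);
		return error;
	}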
*/ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); - if (error) - goto out; - return 0; +out: + /* we rely on s_frozen to stop further updates */ + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return error; } /* @@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb) EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return 0; } @@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; + int enable_quota = 0; ext4_group_t g; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; #endif + char *orig_data = kstrdup(data, GFP_KERNEL); lock_kernel(); @@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount @@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + enable_quota = 1; } } ext4_setup_system_zone(sb); @@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif unlock_super(sb); unlock_kernel(); + if (enable_quota) + dquot_resume(sb, -1); + + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); return 0; restore_opts: @@ -3734,6 +3750,7 @@ restore_opts: #endif unlock_super(sb); unlock_kernel(); + kfree(orig_data); return err; } @@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], - EXT4_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *name, int remount) + char *name) { int err; struct path path; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* When remounting, no checks are needed and in fact, name is NULL */ - if (remount) - return vfs_quota_on(sb, type, format_id, name, remount); err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) @@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, } } - err = vfs_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on_path(sb, type, format_id, &path); path_put(&path); return err; } @@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void) { int err; + ext4_check_flag_values(); err = init_ext4_system_zone(); if (err) return err; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 00740cb..ed9354a 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .setattr = ext4_setattr, #ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct 
inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, + .setattr = ext4_setattr, #ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2de0e95..0433800 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { bad_block: - ext4_error(inode->i_sb, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -820,7 +818,7 @@ inserted: EXT4_I(inode)->i_block_group); /* non-extent files can't have physical blocks past 2^32 */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; block = ext4_new_meta_blocks(handle, inode, @@ -828,7 +826,7 @@ inserted: if (error) goto cleanup; - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); ea_idebug(inode, "creating block %d", block); @@ -880,8 +878,8 @@ cleanup_dquot: goto cleanup; bad_block: - ext4_error(inode->i_sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; #undef header @@ -1194,8 +1192,8 @@ retry: if (!bh) goto cleanup; if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, "inode %lu: block %llu read error", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, "inode %lu: bad block %llu", - inode->i_ino, EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; } ext4_xattr_release_block(handle, inode, bh); @@ -1504,9 +1502,8 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, - "inode %lu: block 
%lu read error", - inode->i_ino, (unsigned long) ce->e_block); + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= EXT4_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %lu refcount %d>=%d", diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 113f0a1..ae8200f 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) while (*fclus < cluster) { /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { - fat_fs_error(sb, "%s: detected the cluster chain loop" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, + "%s: detected the cluster chain loop" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } @@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) if (nr < 0) goto out; else if (nr == FAT_ENT_FREE) { - fat_fs_error(sb, "%s: invalid cluster chain" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + fat_fs_error_ratelimit(sb, "%s: invalid cluster chain" + " (i_pos %lld)", __func__, + MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 530b4ca..ee42b9e 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> #include <linux/compat.h> #include <asm/uaccess.h> +#include <linux/kernel.h> #include "fat.h" /* @@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, { const wchar_t *ip; wchar_t ec; - unsigned char *op, nc; + unsigned char *op; int charlen; - int k; ip = uni; op = ascii; while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; - if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { + if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { op += charlen; len -= charlen; } else { if (uni_xlate == 1) { - *op = ':'; - for (k = 4; k > 0; k--) { - nc = ec & 0xF; - op[k] = nc > 9 ? 
nc + ('a' - 10) - : nc + '0'; - ec >>= 4; - } - op += 5; + *op++ = ':'; + op = pack_hex_byte(op, ec >> 8); + op = pack_hex_byte(op, ec); len -= 5; } else { *op++ = '?'; @@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, return ret; } -static int fat_dir_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long fat_dir_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; @@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, both = 1; break; default: - return fat_generic_ioctl(inode, filp, cmd, arg); + return fat_generic_ioctl(filp, cmd, arg); } if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) @@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, both = 1; break; default: - return -ENOIOCTLCMD; + return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) @@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = fat_readdir, - .ioctl = fat_dir_ioctl, + .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e6efdfa..27ac257 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -6,6 +6,7 @@ #include <linux/nls.h> #include <linux/fs.h> #include <linux/mutex.h> +#include <linux/ratelimit.h> #include <linux/msdos_fs.h> /* @@ -82,6 +83,8 @@ struct msdos_sb_info { struct fatent_operations *fatent_ops; struct inode *fat_inode; + struct ratelimit_state ratelimit; + spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; }; @@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); /* fat/file.c */ -extern int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg); +extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct dentry * dentry, struct iattr * attr); -extern void fat_truncate(struct inode *inode); +extern int fat_setsize(struct inode *inode, loff_t offset); +extern void fat_truncate_blocks(struct inode *inode, loff_t offset); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int fat_file_fsync(struct file *file, struct dentry *dentry, - int datasync); +extern int fat_file_fsync(struct file *file, int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos); @@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void fat_fs_error(struct super_block *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) __cold; +extern void +__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(s, fmt, args...) \ + __fat_fs_error(s, 1, fmt , ## args) +#define fat_fs_error_ratelimit(s, fmt, args...) 
\ + __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/file.c b/fs/fat/file.c index e8c159d..990dfae 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -8,6 +8,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/compat.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/buffer_head.h> @@ -114,9 +115,9 @@ out: return err; } -int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { @@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } } +#ifdef CONFIG_COMPAT +static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) + +{ + return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp) return 0; } -int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int fat_file_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int res, err; - res = simple_fsync(filp, dentry, datasync); + res = generic_file_fsync(filp, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); return res ? res : err; @@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .release = fat_file_release, - .ioctl = fat_generic_ioctl, + .unlocked_ioctl = fat_generic_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = fat_generic_compat_ioctl, +#endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, }; @@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip) return fat_free_clusters(inode, free_start); } -void fat_truncate(struct inode *inode) +void fat_truncate_blocks(struct inode *inode, loff_t offset) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); const unsigned int cluster_size = sbi->cluster_size; @@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode) * This protects against truncating a file bigger than it was then * trying to write into the hole. 
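fat_fs_error() gains a ratelimited sibling: both expand to __fat_fs_error(), whose new report argument only gates the printk -- the errors= policy (remount read-only or panic) is applied whether or not the message was suppressed. The gating in isolation, using the per-superblock ratelimit_state added to struct msdos_sb_info above:

	/* Illustration of the gating behind fat_fs_error_ratelimit(). */
	static void fat_error_sketch(struct super_block *sb, const char *msg)
	{
		int report = __ratelimit(&MSDOS_SB(sb)->ratelimit);

		if (report)
			printk(KERN_ERR "FAT: Filesystem error (dev %s): %s\n",
			       sb->s_id, msg);

		/* remount read-only or panic per the errors= option would
		 * happen here regardless of 'report' */
	}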
*/ - if (MSDOS_I(inode)->mmu_private > inode->i_size) - MSDOS_I(inode)->mmu_private = inode->i_size; + if (MSDOS_I(inode)->mmu_private > offset) + MSDOS_I(inode)->mmu_private = offset; - nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; + nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits; fat_free(inode, nr_clusters); fat_flush_inodes(inode->i_sb, inode, NULL); @@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) return 0; } +int fat_setsize(struct inode *inode, loff_t offset) +{ + int error; + + error = simple_setsize(inode, offset); + if (error) + return error; + fat_truncate_blocks(inode, offset); + + return error; +} + #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) @@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) /* * Expand the file. Since inode_setattr() updates ->i_size * before calling the ->truncate(), but FAT needs to fill the - * hole before it. + * hole before it. XXX: this is no longer true with new truncate + * sequence. */ if (attr->ia_valid & ATTR_SIZE) { if (attr->ia_size > inode->i_size) { @@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; } - if (attr->ia_valid) - error = inode_setattr(inode, attr); + if (attr->ia_valid & ATTR_SIZE) { + error = fat_setsize(inode, attr->ia_size); + if (error) + goto out; + } + + generic_setattr(inode, attr); + mark_inode_dirty(inode); out: return error; } EXPORT_SYMBOL_GPL(fat_setattr); const struct inode_operations fat_file_inode_operations = { - .truncate = fat_truncate, .setattr = fat_setattr, .getattr = fat_getattr, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0ce143b..7bf45ae 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, fat_get_block); } +static void fat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, to, inode->i_size); + fat_truncate_blocks(inode, inode->i_size); + } +} + static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int err; + *pagep = NULL; - return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - fat_get_block, + err = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); + if (err < 0) + fat_write_failed(mapping, pos + len); + return err; } static int fat_write_end(struct file *file, struct address_space *mapping, @@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; int err; err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + if (err < len) + fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; MSDOS_I(inode)->i_attrs |= ATTR_ARCH; @@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode 
*inode = mapping->host; + ssize_t ret; if (rw == WRITE) { /* @@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, fat_get_block, NULL); + ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, fat_get_block, NULL); + if (ret < 0 && (rw & WRITE)) + fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + + return ret; } static sector_t _fat_bmap(struct address_space *mapping, sector_t block) @@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; - fat_truncate(inode); + fat_truncate_blocks(inode, 0); clear_inode(inode); } @@ -1250,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; sbi->dir_ops = fs_dir_inode_ops; + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); error = parse_options(data, isvfat, silent, &debug, &sbi->options); if (error) @@ -1497,10 +1522,8 @@ out_fail: iput(fat_inode); if (root_inode) iput(root_inode); - if (sbi->nls_io) - unload_nls(sbi->nls_io); - if (sbi->nls_disk) - unload_nls(sbi->nls_disk); + unload_nls(sbi->nls_io); + unload_nls(sbi->nls_disk); if (sbi->options.iocharset != fat_default_iocharset) kfree(sbi->options.iocharset); sb->s_fs_info = NULL; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index d3da05f..1fa23f6 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -20,27 +20,29 @@ * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ -void fat_fs_error(struct super_block *s, const char *fmt, ...) +void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) { struct fat_mount_options *opts = &MSDOS_SB(s)->options; va_list args; - printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); + if (report) { + printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); - printk(KERN_ERR " "); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - printk("\n"); + printk(KERN_ERR " "); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + printk("\n"); + } if (opts->errors == FAT_ERRORS_PANIC) - panic(" FAT fs panic from previous error\n"); + panic("FAT: fs panic from previous error\n"); else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { s->s_flags |= MS_RDONLY; - printk(KERN_ERR " File system has been set read-only\n"); + printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); } } -EXPORT_SYMBOL_GPL(fat_fs_error); +EXPORT_SYMBOL_GPL(__fat_fs_error); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. 
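Both new FAT write paths undo a failed extending write the same way: roll the page cache and the allocated clusters back to the old i_size via fat_write_failed(). For the direct-IO case the end of the attempted write is recomputed with iov_length(). A condensed write-side caller (error handling only; the read path is untouched):

	/* Condensed from fat_direct_IO()'s new error handling. */
	static ssize_t dio_write_sketch(struct kiocb *iocb, const struct iovec *iov,
					loff_t offset, unsigned long nr_segs)
	{
		struct address_space *mapping = iocb->ki_filp->f_mapping;
		struct inode *inode = mapping->host;
		ssize_t ret;

		ret = blockdev_direct_IO_newtrunc(WRITE, iocb, inode,
						  inode->i_sb->s_bdev, iov, offset,
						  nr_segs, fat_get_block, NULL);
		if (ret < 0)	/* extending write failed: trim cache and clusters back */
			fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
		return ret;
	}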
Currently only writes 1 */ diff --git a/fs/file_table.c b/fs/file_table.c index 32d12b7..5c7d10e 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode, } EXPORT_SYMBOL(alloc_file); -void fput(struct file *file) -{ - if (atomic_long_dec_and_test(&file->f_count)) - __fput(file); -} - -EXPORT_SYMBOL(fput); - /** * drop_file_write_access - give up ability to write to a file * @file: the file to which we will stop writing @@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file) } EXPORT_SYMBOL_GPL(drop_file_write_access); -/* __fput is called from task context when aio completion releases the last - * last use of a struct file *. Do not use otherwise. +/* the real guts of fput() - releasing the last reference to file */ -void __fput(struct file *file) +static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; @@ -268,6 +259,14 @@ void __fput(struct file *file) mntput(mnt); } +void fput(struct file *file) +{ + if (atomic_long_dec_and_test(&file->f_count)) + __fput(file); +} + +EXPORT_SYMBOL(fput); + struct file *fget(unsigned int fd) { struct file *file; diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index aee049c..0ec7bb2 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = { }; const struct file_operations vxfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, .readdir = vxfs_readdir, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 408a787..1d1088f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -398,11 +398,11 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - do { + while (inode->i_state & I_SYNC) { spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); - } while (inode->i_state & I_SYNC); + } } /* diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 1e1f286..4a8eb31 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) /* banners (can't represent line 0 by pos 0 as that would involve * returning a NULL pointer) */ if (pos == 0) - return (struct fscache_object *) ++(*_pos); + return (struct fscache_object *)(long)++(*_pos); if (pos < 3) return (struct fscache_object *)pos; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb7e942..9424796 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -16,8 +16,12 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/slab.h> +#include <linux/pipe_fs_i.h> +#include <linux/swap.h> +#include <linux/splice.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; @@ -498,6 +502,9 @@ struct fuse_copy_state { int write; struct fuse_req *req; const struct iovec *iov; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; unsigned long nr_segs; unsigned long seglen; unsigned long addr; @@ -505,16 +512,16 @@ struct fuse_copy_state { void *mapaddr; void *buf; unsigned len; + unsigned move_pages:1; }; static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, - int write, struct fuse_req *req, + int write, const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); 
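The fs/fs-writeback.c hunk above turns the unconditional do/while wait into a plain while loop, so a caller that finds I_SYNC already clear does not sleep at all. The full helper, with the wait-bit setup that precedes the shown hunk reconstructed here as an assumption:

	/* inode_wait_for_writeback() after the change; the DEFINE_WAIT_BIT line
	 * is outside the hunk above and is an assumption.  inode_lock is held. */
	static void inode_wait_for_writeback_sketch(struct inode *inode)
	{
		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
		wait_queue_head_t *wqh;

		wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
		while (inode->i_state & I_SYNC) {
			spin_unlock(&inode_lock);
			__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
			spin_lock(&inode_lock);
		}
	}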
cs->fc = fc; cs->write = write; - cs->req = req; cs->iov = iov; cs->nr_segs = nr_segs; } @@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, /* Unmap and put previous page of userspace buffer */ static void fuse_copy_finish(struct fuse_copy_state *cs) { - if (cs->mapaddr) { + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + if (!cs->write) { + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + } else { + kunmap_atomic(cs->mapaddr, KM_USER0); + buf->len = PAGE_SIZE - cs->len; + } + cs->currbuf = NULL; + cs->mapaddr = NULL; + } else if (cs->mapaddr) { kunmap_atomic(cs->mapaddr, KM_USER0); if (cs->write) { flush_dcache_page(cs->pg); @@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); - if (!cs->seglen) { - BUG_ON(!cs->nr_segs); - cs->seglen = cs->iov[0].iov_len; - cs->addr = (unsigned long) cs->iov[0].iov_base; - cs->iov++; - cs->nr_segs--; + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->len = buf->len; + cs->buf = cs->mapaddr + buf->offset; + cs->pipebufs++; + cs->nr_segs--; + } else { + struct page *page; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->mapaddr = kmap_atomic(page, KM_USER0); + cs->buf = cs->mapaddr; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } + } else { + if (!cs->seglen) { + BUG_ON(!cs->nr_segs); + cs->seglen = cs->iov[0].iov_len; + cs->addr = (unsigned long) cs->iov[0].iov_base; + cs->iov++; + cs->nr_segs--; + } + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + if (err < 0) + return err; + BUG_ON(err != 1); + offset = cs->addr % PAGE_SIZE; + cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); + cs->buf = cs->mapaddr + offset; + cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->seglen -= cs->len; + cs->addr += cs->len; } - down_read(¤t->mm->mmap_sem); - err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, - &cs->pg, NULL); - up_read(¤t->mm->mmap_sem); - if (err < 0) - return err; - BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); - cs->seglen -= cs->len; - cs->addr += cs->len; return lock_request(cs->fc, cs->req); } @@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + struct 
address_space *mapping; + pgoff_t index; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (WARN_ON(!PageUptodate(newpage))) + return -EIO; + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + mapping = oldpage->mapping; + index = oldpage->index; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + remove_from_page_cache(oldpage); + page_cache_release(oldpage); + + err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL); + if (err) { + printk(KERN_WARNING "fuse_try_move_page: failed to add page"); + goto out_fallback_unlock; + } + page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->buf = cs->mapaddr + buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + /* * Copy a page in the request to/from the userspace buffer. 
Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, unsigned offset, unsigned count, int zeroing) { + int err; + struct page *page = *pagep; + if (page && zeroing && count < PAGE_SIZE) { void *mapaddr = kmap_atomic(page, KM_USER1); memset(mapaddr, 0, PAGE_SIZE); kunmap_atomic(mapaddr, KM_USER1); } while (count) { - if (!cs->len) { - int err = fuse_copy_fill(cs); - if (err) - return err; + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } } if (page) { void *mapaddr = kmap_atomic(page, KM_USER1); @@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { - struct page *page = req->pages[i]; - int err = fuse_copy_page(cs, page, offset, count, zeroing); + int err; + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); if (err) return err; @@ -704,11 +914,10 @@ __acquires(&fc->lock) * * Called with fc->lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, - const struct iovec *iov, unsigned long nr_segs) +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) __releases(&fc->lock) { - struct fuse_copy_state cs; struct fuse_in_header ih; struct fuse_interrupt_in arg; unsigned reqsize = sizeof(ih) + sizeof(arg); @@ -724,14 +933,13 @@ __releases(&fc->lock) arg.unique = req->in.h.unique; spin_unlock(&fc->lock); - if (iov_length(iov, nr_segs) < reqsize) + if (nbytes < reqsize) return -EINVAL; - fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); - err = fuse_copy_one(&cs, &ih, sizeof(ih)); + err = fuse_copy_one(cs, &ih, sizeof(ih)); if (!err) - err = fuse_copy_one(&cs, &arg, sizeof(arg)); - fuse_copy_finish(&cs); + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); return err ? err : reqsize; } @@ -745,18 +953,13 @@ __releases(&fc->lock) * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. 
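 *
 * (Editorial aside, not part of this patch: a minimal sketch of the
 *  user-space side that drives this path, assuming fuse_fd is an
 *  already-mounted /dev/fuse descriptor and handle_request() is a
 *  hypothetical dispatcher:
 *
 *	char buf[1 << 17];
 *	for (;;) {
 *		ssize_t n = read(fuse_fd, buf, sizeof(buf));
 *		if (n < 0 && errno == EINTR)
 *			continue;
 *		if (n <= 0)
 *			break;
 *		handle_request(buf, n);
 *	}
 *
 *  If the supplied buffer is smaller than the queued request, the code
 *  below fails that request with -EIO and restarts the read.)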
*/ -static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) { int err; struct fuse_req *req; struct fuse_in *in; - struct fuse_copy_state cs; unsigned reqsize; - struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) - return -EPERM; restart: spin_lock(&fc->lock); @@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, if (!list_empty(&fc->interrupts)) { req = list_entry(fc->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, req, iov, nr_segs); + return fuse_read_interrupt(fc, cs, nbytes, req); } req = list_entry(fc->pending.next, struct fuse_req, list); @@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, in = &req->in; reqsize = in->h.len; /* If request is too large, reply with an error and restart the read */ - if (iov_length(iov, nr_segs) < reqsize) { + if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ if (in->h.opcode == FUSE_SETXATTR) @@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, goto restart; } spin_unlock(&fc->lock); - fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); - err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) - err = fuse_copy_args(&cs, in->numargs, in->argpages, + err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); spin_lock(&fc->lock); req->locked = 0; if (req->aborted) { @@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return err; } +static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 1, iov, nr_segs); + + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); +} + +static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_dev_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL, 0); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; + goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int 
newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + buf->ops = &fuse_dev_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { @@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) { int err; - size_t nbytes = iov_length(iov, nr_segs); struct fuse_req *req; struct fuse_out_header oh; - struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) - return -EPERM; - fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; - err = fuse_copy_one(&cs, &oh, sizeof(oh)); + err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) goto err_finish; @@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, * and error contains notification code. */ if (!oh.unique) { - err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); return err ? 
err : nbytes; } @@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, if (req->aborted) { spin_unlock(&fc->lock); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; @@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, queue_interrupt(fc, req); spin_unlock(&fc->lock); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); return nbytes; } @@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, list_move(&req->list, &fc->io); req->out.h = oh; req->locked = 1; - cs.req = req; + cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; spin_unlock(&fc->lock); - err = copy_out_args(&cs, &req->out, nbytes); - fuse_copy_finish(&cs); + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); spin_lock(&fc->lock); req->locked = 0; @@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, err_unlock: spin_unlock(&fc->lock); err_finish: - fuse_copy_finish(&cs); + fuse_copy_finish(cs); return err; } +static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 0, iov, nr_segs); + + return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + fuse_copy_init(&cs, fc, 0, NULL, nbuf); + cs.pipebufs = bufs; + cs.pipe = pipe; + + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; @@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = { .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, + .splice_read = fuse_dev_splice_read, .write = do_sync_write, .aio_write = fuse_dev_write, + .splice_write = fuse_dev_splice_write, .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, 
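With .splice_read and .splice_write wired into fuse_dev_operations above, a FUSE server can shuttle request and reply data through a pipe instead of copying it through its own buffers. A minimal editorial sketch of the read side, not part of the patch (fuse_fd stands for a hypothetical, already-mounted /dev/fuse descriptor; error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Editorial sketch: pull one FUSE request from the device into a pipe via
 * the new fuse_dev_splice_read() path.  1 MiB is an arbitrary upper bound. */
static ssize_t splice_one_request(int fuse_fd)
{
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0)
		return -1;

	/* The reply could later be pushed back with splice(pfd[0] -> fuse_fd),
	 * exercising fuse_dev_splice_write(); with SPLICE_F_MOVE the kernel
	 * may steal the pipe pages instead of copying them. */
	n = splice(fuse_fd, NULL, pfd[1], NULL, 1 << 20, SPLICE_F_MOVE);
	if (n > 0)
		fprintf(stderr, "spliced %zd bytes from /dev/fuse\n", n);

	close(pfd[0]);
	close(pfd[1]);
	return n;
}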
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 4787ae6..3cdc5f7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file) return 0; } -static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) +static int fuse_dir_fsync(struct file *file, int datasync) { - /* nfsd can call this with no file */ - return file ? fuse_fsync_common(file, de, datasync, 1) : 0; + return fuse_fsync_common(file, datasync, 1); } static bool update_mtime(unsigned ivalid) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a9f5e13..ada0ade 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, - int isdir) +int fuse_fsync_common(struct file *file, int datasync, int isdir) { - struct inode *inode = de->d_inode; + struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; struct fuse_req *req; @@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, return err; } -static int fuse_fsync(struct file *file, struct dentry *de, int datasync) +static int fuse_fsync(struct file *file, int datasync) { - return fuse_fsync_common(file, de, datasync, 0); + return fuse_fsync_common(file, datasync, 0); } void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, @@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) int i; size_t count = req->misc.read.in.size; size_t num_read = req->out.args[0].size; - struct inode *inode = req->pages[0]->mapping->host; + struct address_space *mapping = NULL; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!req->out.h.error && num_read < count) { - loff_t pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, req->misc.read.attr_ver); - } + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; - fuse_invalidate_attr(inode); /* atime changed */ + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. 
If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos; + + pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, + req->misc.read.attr_ver); + } + fuse_invalidate_attr(inode); /* atime changed */ + } for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) else SetPageError(page); unlock_page(page); + page_cache_release(page); } if (req->ff) fuse_file_put(req->ff); @@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file) req->out.argpages = 1; req->out.page_zeroing = 1; + req->out.page_replace = 1; fuse_read_fill(req, file, pos, count, FUSE_READ); req->misc.read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { @@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) return PTR_ERR(req); } } + page_cache_get(page); req->pages[req->num_pages] = page; req->num_pages++; return 0; @@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); - down_read(¤t->mm->mmap_sem); - npages = get_user_pages(current, current->mm, user_addr, npages, !write, - 0, req->pages, NULL); - up_read(¤t->mm->mmap_sem); + npages = get_user_pages_fast(user_addr, npages, !write, req->pages); if (npages < 0) return npages; @@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, while (iov_iter_count(&ii)) { struct page *page = pages[page_idx++]; size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); - void *kaddr, *map; + void *kaddr; - kaddr = map = kmap(page); + kaddr = kmap(page); while (todo) { char __user *uaddr = ii.iov->iov_base + ii.iov_offset; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 01cc462..8f309f0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -177,6 +177,9 @@ struct fuse_out { /** Zero partially or not copied pages */ unsigned page_zeroing:1; + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + /** Number or arguments */ unsigned numargs; @@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode); /** * Send FSYNC or FSYNCDIR request */ -int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, - int isdir); +int fuse_fsync_common(struct file *file, int datasync, int isdir); /** * Notify poll wakeup diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 9fb76b0..48171f4 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int xtype) { struct inode *inode = dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct posix_acl *acl; int type; int error; + if (!sdp->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + type = gfs2_acl_type(name); if (type < 0) return type; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index a739a0a..9f8b525 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -700,8 +700,14 @@ out: return 0; page_cache_release(page); + + /* + * XXX(hch): the call below should probably be replaced with + * a call to the gfs2-specific truncate blocks helper to actually + * release disk blocks.. 
+ */ if (pos + len > ip->i_inode.i_size) - vmtruncate(&ip->i_inode, ip->i_inode.i_size); + simple_setsize(&ip->i_inode, ip->i_inode.i_size); out_endtrans: gfs2_trans_end(sdp); out_trans_fail: diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e6dd2ae..ed9a94f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_drop_write; + error = -EACCES; + if (!is_owner_or_cap(inode)) + goto out; + + error = 0; flags = ip->i_diskflags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) @@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) { struct inode *inode = filp->f_path.dentry->d_inode; u32 fsflags, gfsflags; + if (get_user(fsflags, ptr)) return -EFAULT; + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); if (!S_ISDIR(inode->i_mode)) { if (gfsflags & GFS2_DIF_INHERIT_JDATA) @@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file) * Returns: errno */ -static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +static int gfs2_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); int ret = 0; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 51d8061..b5612cb 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -242,34 +242,38 @@ fail: } /** - * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation + * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation + * and try to reclaim it by doing iput. + * + * This function assumes no rgrp locks are currently held. + * * @sb: The super block * no_addr: The inode number - * @@inode: A pointer to the inode found, if any * - * Returns: 0 and *inode if no errors occurred. If an error occurs, - * the resulting *inode may or may not be NULL. */ -int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, - struct inode **inode) +void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) { struct gfs2_sbd *sdp; struct gfs2_inode *ip; struct gfs2_glock *io_gl; int error; struct gfs2_holder gh; + struct inode *inode; - *inode = gfs2_iget_skip(sb, no_addr); + inode = gfs2_iget_skip(sb, no_addr); - if (!(*inode)) - return -ENOBUFS; + if (!inode) + return; - if (!((*inode)->i_state & I_NEW)) - return -ENOBUFS; + /* If it's not a new inode, someone's using it, so leave it alone. 
*/ + if (!(inode->i_state & I_NEW)) { + iput(inode); + return; + } - ip = GFS2_I(*inode); - sdp = GFS2_SB(*inode); + ip = GFS2_I(inode); + sdp = GFS2_SB(inode); ip->i_no_formal_ino = -1; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); @@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, &ip->i_iopen_gh); - if (unlikely(error)) { - if (error == GLR_TRYFAILED) - error = 0; + if (unlikely(error)) goto fail_iopen; - } + ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); - (*inode)->i_mode = DT2IF(DT_UNKNOWN); + inode->i_mode = DT2IF(DT_UNKNOWN); /* * We must read the inode in order to work out its type in @@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, &gh); - if (unlikely(error)) { - if (error == GLR_TRYFAILED) - error = 0; + if (unlikely(error)) goto fail_glock; - } + /* Inode is now uptodate */ gfs2_glock_dq_uninit(&gh); - gfs2_set_iop(*inode); + gfs2_set_iop(inode); + + /* The iput will cause it to be deleted. */ + iput(inode); + return; - return 0; fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: @@ -321,7 +324,8 @@ fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); fail: - return error; + iget_failed(inode); + return; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index e161461..300ada3 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); -extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, - struct inode **inode); +extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index b593f0e..6a857e24 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) * */ -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) { struct gfs2_ail *ai; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index eb570b4..0d007f9 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); -int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, +extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, struct buffer_head *real); -void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern 
void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); -static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) -{ - if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) - __gfs2_log_flush(sbd, gl); -} - -void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -void gfs2_remove_from_ail(struct gfs2_bufdata *bd); - -void gfs2_log_shutdown(struct gfs2_sbd *sdp); -void gfs2_meta_syncfs(struct gfs2_sbd *sdp); -int gfs2_logd(void *data); +extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); +extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); +extern int gfs2_logd(void *data); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 4e64352..98cdd05 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask) return error; } +/* + * XXX: should be changed to have proper ordering by opencoding simple_setsize + */ static int setattr_size(struct inode *inode, struct iattr *attr) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr) error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) return error; - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); gfs2_trans_end(sdp); if (error) return error; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 117fa41..171a744 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1192,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; - struct inode *inode; int error = 0; u64 last_unlinked = NO_BLOCK, unlinked; @@ -1210,22 +1209,27 @@ try_again: if (error) return error; + /* Find an rgrp suitable for allocation. If it encounters any unlinked + dinodes along the way, error will equal -EAGAIN and unlinked will + contains it block address. We then need to look up that inode and + try to free it, and try the allocation again. */ error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { if (ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); if (error != -EAGAIN) return error; - error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, - unlinked, &inode); - if (inode) - iput(inode); + + gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); + /* regardless of whether or not gfs2_process_unlinked_inode + was successful, we don't want to repeat it again. */ + last_unlinked = unlinked; gfs2_log_flush(sdp, NULL); - if (error == GLR_TRYFAILED) - error = 0; + error = 0; + goto try_again; } - + /* no error, so we have the rgrp set in the inode's allocation. 
*/ al->al_file = file; al->al_line = line; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 5f40236..764fd1b 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { const struct file_operations hfsplus_dir_operations = { .read = generic_read_dir, .readdir = hfsplus_readdir, - .ioctl = hfsplus_ioctl, + .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, }; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 5c10d80..6505c30 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int); void hfsplus_delete_inode(struct inode *); /* ioctl.c */ -int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg); +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); int hfsplus_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 1bcf597..9bbb829 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = { .fsync = file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, - .ioctl = hfsplus_ioctl, + .unlocked_ioctl = hfsplus_ioctl, }; struct inode *hfsplus_new_inode(struct super_block *sb, int mode) diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index f457d2c..ac405f0 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -17,14 +17,16 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> +#include <linux/smp_lock.h> #include <asm/uaccess.h> #include "hfsplus_fs.h" -int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, - unsigned long arg) +long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_path.dentry->d_inode; unsigned int flags; + lock_kernel(); switch (cmd) { case HFSPLUS_IOC_EXT2_GETFLAGS: flags = 0; @@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, case HFSPLUS_IOC_EXT2_SETFLAGS: { int err = 0; err = mnt_want_write(filp->f_path.mnt); - if (err) + if (err) { + unlock_kernel(); return err; + } if (!is_owner_or_cap(inode)) { err = -EACCES; @@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, mark_inode_dirty(inode); setflags_out: mnt_drop_write(filp->f_path.mnt); + unlock_kernel(); return err; } default: + unlock_kernel(); return -ENOTTY; } } diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 3a029d8..87ac189 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file) return 0; } -int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int hostfs_fsync(struct file *file, int datasync) { - return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); + return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync); } static const struct file_operations hostfs_file_fops = { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 3efabff..a9ae9bf 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file) return 0; } -int hpfs_file_fsync(struct file *file, struct dentry 
*dentry, int datasync) +int hpfs_file_fsync(struct file *file, int datasync) { - /*return file_fsync(file, dentry);*/ + /*return file_fsync(file, datasync);*/ return 0; /* Don't fsync :-) */ } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 97bf738..75f9d43 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *, /* file.c */ -int hpfs_file_fsync(struct file *, struct dentry *, int); +int hpfs_file_fsync(struct file *, int); extern const struct file_operations hpfs_file_ops; extern const struct inode_operations hpfs_file_iops; extern const struct address_space_operations hpfs_aops; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 2e4dfa8..826c3f9 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) return err; } -static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) +static int hppfs_fsync(struct file *file, int datasync) { return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a0bbd3d..a4e9a7e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -688,7 +688,7 @@ static void init_once(void *foo) const struct file_operations hugetlbfs_file_operations = { .read = hugetlbfs_read, .mmap = hugetlbfs_file_mmap, - .fsync = simple_sync_file, + .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, }; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index b9ab69b..e0aca9a 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp, const struct file_operations isofs_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = isofs_readdir, }; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index bfc70f5..e214d68 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); transaction->t_outstanding_credits -= handle->h_buffer_credits; transaction->t_updates--; @@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ - __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, transaction->t_tid); /* * Special case: JBD2_SYNC synchronous updates require us @@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle) err = jbd2_log_wait_commit(journal, tid); } else { spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); } lock_map_release(&handle->h_lockdep_map); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index e7291c1..8134970 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page **pagep, void **fsdata); static int jffs2_readpage (struct file *filp, struct page *pg); -int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) +int jffs2_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); /* Trigger GC to flush any pending writes for this inode */ diff --git a/fs/jffs2/fs.c 
b/fs/jffs2/fs.c index 86e0821..8bc2c80 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) mutex_unlock(&f->sem); jffs2_complete_reservation(c); - /* We have to do the vmtruncate() without f->sem held, since + /* We have to do the simple_setsize() without f->sem held, since some pages may be locked and waiting for it in readpage(). We are protected from a simultaneous write() extending i_size back past iattr->ia_size, because do_truncate() holds the generic inode semaphore. */ if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { - vmtruncate(inode, iattr->ia_size); + simple_setsize(inode, iattr->ia_size); inode->i_blocks = (inode->i_size + 511) >> 9; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 035a767..4791aac 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations; extern const struct file_operations jffs2_file_operations; extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; -int jffs2_fsync(struct file *, struct dentry *, int); +int jffs2_fsync(struct file *, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); /* ioctl.c */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 85d9ec6..127263c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -27,9 +27,9 @@ #include "jfs_acl.h" #include "jfs_debug.h" -int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int jfs_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int rc = 0; if (!(inode->i_state & I_DIRTY) || diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 9e6bda3..11042b1 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -21,7 +21,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); -extern int jfs_fsync(struct file *, struct dentry *, int); +extern int jfs_fsync(struct file *, int); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index b66832a..b38f96b 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb) jfs_info("In jfs_put_super"); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + lock_kernel(); rc = jfs_umount(sb); @@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); + + /* mark the fs r/w for quota activity */ + sb->s_flags &= ~MS_RDONLY; + unlock_kernel(); + dquot_resume(sb, -1); return ret; } if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { + rc = dquot_suspend(sb, -1); + if (rc < 0) { + unlock_kernel(); + return rc; + } rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; unlock_kernel(); @@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &dquot_quotactl_ops; +#endif /* * Initialize direct-mapping inode/address-space @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/mount.h> #include <linux/vfs.h> +#include 
<linux/quotaops.h> #include <linux/mutex.h> #include <linux/exportfs.h> #include <linux/writeback.h> @@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na return NULL; } -int simple_sync_file(struct file * file, struct dentry *dentry, int datasync) -{ - return 0; -} - int dcache_dir_open(struct inode *inode, struct file *file) { static struct qstr cursor_name = {.len = 1, .name = "."}; @@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = { .llseek = dcache_dir_lseek, .read = generic_read_dir, .readdir = dcache_readdir, - .fsync = simple_sync_file, + .fsync = noop_fsync, }; const struct inode_operations simple_dir_inode_operations = { @@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } +/** + * simple_setsize - handle core mm and vfs requirements for file size change + * @inode: inode + * @newsize: new file size + * + * Returns 0 on success, -error on failure. + * + * simple_setsize must be called with inode_mutex held. + * + * simple_setsize will check that the requested new size is OK (see + * inode_newsize_ok), and then will perform the necessary i_size update + * and pagecache truncation (if necessary). It will be typically be called + * from the filesystem's setattr function when ATTR_SIZE is passed in. + * + * The inode itself must have correct permissions and attributes to allow + * i_size to be changed, this function then just checks that the new size + * requested is valid. + * + * In the case of simple in-memory filesystems with inodes stored solely + * in the inode cache, and file data in the pagecache, nothing more needs + * to be done to satisfy a truncate request. Filesystems with on-disk + * blocks for example will need to free them in the case of truncate, in + * that case it may be easier not to use simple_setsize (but each of its + * components will likely be required at some point to update pagecache + * and inode etc). + */ +int simple_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, newsize); + if (error) + return error; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + truncate_pagecache(inode, oldsize, newsize); + + return error; +} +EXPORT_SYMBOL(simple_setsize); + +/** + * simple_setattr - setattr for simple in-memory filesystem + * @dentry: dentry + * @iattr: iattr structure + * + * Returns 0 on success, -error on failure. + * + * simple_setattr implements setattr for an in-memory filesystem which + * does not store its own file data or metadata (eg. uses the page cache + * and inode cache as its data store). 
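 *
 * (Editorial aside, not part of this patch: such a filesystem would
 *  typically just point its inode operations at the helper, e.g.
 *
 *	static const struct inode_operations examplefs_file_inode_operations = {
 *		.getattr	= simple_getattr,
 *		.setattr	= simple_setattr,
 *	};
 *
 *  where examplefs_* is a made-up name; truncation is then handled entirely
 *  by simple_setsize()'s i_size update and pagecache truncation.)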
+ */ +int simple_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) { + error = simple_setsize(inode, iattr->ia_size); + if (error) + return error; + } + + generic_setattr(inode, iattr); + + return error; +} +EXPORT_SYMBOL(simple_setattr); + int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); @@ -851,13 +922,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, } EXPORT_SYMBOL_GPL(generic_fh_to_parent); -int simple_fsync(struct file *file, struct dentry *dentry, int datasync) +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * @file: file to synchronize + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int generic_file_fsync(struct file *file, int datasync) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, /* metadata-only; caller takes care of data */ }; - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int err; int ret; @@ -872,7 +952,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync) ret = err; return ret; } -EXPORT_SYMBOL(simple_fsync); +EXPORT_SYMBOL(generic_file_fsync); + +/* + * No-op implementation of ->fsync for in-memory filesystems. + */ +int noop_fsync(struct file *file, int datasync) +{ + return 0; +} EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); @@ -895,7 +983,7 @@ EXPORT_SYMBOL(simple_release_fs); EXPORT_SYMBOL(simple_rename); EXPORT_SYMBOL(simple_rmdir); EXPORT_SYMBOL(simple_statfs); -EXPORT_SYMBOL(simple_sync_file); +EXPORT_SYMBOL(noop_fsync); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(simple_write_to_buffer); diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 0de5240..abe1caf 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, } } -int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int logfs_fsync(struct file *file, int datasync) { - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = file->f_mapping->host->i_sb; logfs_write_anchor(sb); return 0; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 1a9db84..c838c4d 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops; int logfs_readpage(struct file *file, struct page *page); int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); -int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); +int logfs_fsync(struct file *file, int datasync); /* gc.c */ u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 6198731..9196958 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = minix_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) @@ -72,11 +72,8 @@ static struct page * 
dir_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { + if (!IS_ERR(page)) kmap(page); - if (!PageUptodate(page)) - goto fail; - } return page; fail: diff --git a/fs/minix/file.c b/fs/minix/file.c index 3eec3e6..d5320ff 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index f230109..13487ad 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode) return (block_t *)minix_i(inode)->u.i2_data; } +#define DIRCOUNT 7 +#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2)) + static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; @@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) printk("MINIX-fs: block_to_path: " "block %ld too big on dev %s\n", block, bdevname(sb->s_bdev, b)); - } else if (block < 7) { + } else if (block < DIRCOUNT) { offsets[n++] = block; - } else if ((block -= 7) < 256) { - offsets[n++] = 7; + } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT; offsets[n++] = block; - } else if ((block -= 256) < 256*256) { - offsets[n++] = 8; - offsets[n++] = block>>8; - offsets[n++] = block & 255; + } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) { + offsets[n++] = DIRCOUNT + 1; + offsets[n++] = block / INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); } else { - block -= 256*256; - offsets[n++] = 9; - offsets[n++] = block>>16; - offsets[n++] = (block>>8) & 255; - offsets[n++] = block & 255; + block -= INDIRCOUNT(sb) * INDIRCOUNT(sb); + offsets[n++] = DIRCOUNT + 2; + offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb); + offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb); + offsets[n++] = block % INDIRCOUNT(sb); } return n; } @@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, case LAST_DOTDOT: follow_dotdot(nd); dir = nd->path.dentry; + case LAST_DOT: if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { if (!dir->d_op->d_revalidate(dir, nd)) { error = -ESTALE; @@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } } /* fallthrough */ - case LAST_DOT: case LAST_ROOT: if (open_flag & O_CREAT) goto exit; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 7edfcd4..9578cbe 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *); const struct file_operations ncp_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ncp_readdir, - .ioctl = ncp_ioctl, + .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, #endif diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 1daabb9..3639cc5 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -22,7 +22,7 @@ #include <linux/ncp_fs.h> #include "ncplib_kernel.h" -static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) +static int ncp_fsync(struct file *file, int datasync) { return 0; } @@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations 
= .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, - .ioctl = ncp_ioctl, + .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, #endif diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60a5e28..023c03d 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,6 +20,7 @@ #include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include <linux/ncp_fs.h> @@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) } #endif /* CONFIG_NCPFS_NLS */ -static int __ncp_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ncp_server *server = NCP_SERVER(inode); int result; struct ncp_ioctl_request request; @@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd) } } -int ncp_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - int ret; + long ret; + lock_kernel(); if (ncp_ioctl_need_write(cmd)) { /* * inside the ioctl(), any failures which @@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp, * -EACCESS, so it seems consistent to keep * that here. */ - if (mnt_want_write(filp->f_path.mnt)) - return -EACCES; + if (mnt_want_write(filp->f_path.mnt)) { + ret = -EACCES; + goto out; + } } - ret = __ncp_ioctl(inode, filp, cmd, arg); + ret = __ncp_ioctl(filp, cmd, arg); if (ncp_ioctl_need_write(cmd)) mnt_drop_write(filp->f_path.mnt); + +out: + unlock_kernel(); return ret; } #ifdef CONFIG_COMPAT long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; + long ret; lock_kernel(); arg = (unsigned long) compat_ptr(arg); - ret = ncp_ioctl(inode, file, cmd, arg); + ret = ncp_ioctl(file, cmd, arg); unlock_kernel(); return ret; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ee9a179..782b431 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *); static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -static int nfs_fsync_dir(struct file *, struct dentry *, int); +static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); const struct file_operations nfs_dir_operations = { @@ -641,8 +641,10 @@ out: * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. 
*/ -static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) +static int nfs_fsync_dir(struct file *filp, int datasync) { + struct dentry *dentry = filp->f_path.dentry; + dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); @@ -1741,6 +1743,7 @@ remove_lru_entry: clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); smp_mb__after_clear_bit(); } + spin_unlock(&inode->i_lock); } spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cac96bc..36a5e74 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); +static int nfs_file_fsync(struct file *, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * whether any write errors occurred for this process. */ static int -nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) +nfs_file_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2f8b1157..04214fc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; rc = strict_strtoul(string, 10, &option); kfree(string); - if (rc != 0 || option > USHORT_MAX) + if (rc != 0 || option > USHRT_MAX) goto out_invalid_value; mnt->nfs_server.port = option; break; @@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; rc = strict_strtoul(string, 10, &option); kfree(string); - if (rc != 0 || option > USHORT_MAX) + if (rc != 0 || option > USHRT_MAX) goto out_invalid_value; mnt->mount_server.port = option; break; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3aea3ca..91679e2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how) int res = 0; if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) - goto out; + goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); @@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how) wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, nfs_wait_bit_killable, TASK_KILLABLE); + else + goto out_mark_dirty; } else nfs_commit_clear_lock(NFS_I(inode)); -out: + return res; + /* Note: If we exit without ensuring that the commit is complete, + * we must mark the inode as dirty. Otherwise, future calls to + * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure + * that the data is on the disk. 
+ */ +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } @@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page) }; int ret; - while(PagePrivate(page)) { + for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; + continue; } - ret = sync_inode(inode, &wbc); + if (!PagePrivate(page)) + break; + ret = nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) goto out_error; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc3194e..508941c 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf) if (sscanf(buf, "%15s %4u", transport, &port) != 2) return -EINVAL; - if (port < 1 || port > USHORT_MAX) + if (port < 1 || port > USHRT_MAX) return -EINVAL; err = nfsd_create_serv(); @@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf) if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) return -EINVAL; - if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) + if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL) return -EINVAL; xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 30292df..c9a30d7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -27,7 +27,7 @@ #include "nilfs.h" #include "segment.h" -int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +int nilfs_sync_file(struct file *file, int datasync) { /* * Called from fsync() system call @@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * This function should be implemented when the writeback function * will be implemented. */ - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; int err; if (!nilfs_inode_dirty(inode)) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 8723e5b..47d6d79 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, struct page *, struct inode *); /* file.c */ -extern int nilfs_sync_file(struct file *, struct dentry *, int); +extern int nilfs_sync_file(struct file *, int); /* ioctl.c */ long nilfs_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index fe44d3f..0f48e7c 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) * this problem for now. We do write the $BITMAP attribute if it is present * which is the important one for a directory so things are not too bad. */ -static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int ntfs_dir_fsync(struct file *filp, int datasync) { - struct inode *bmp_vi, *vi = dentry->d_inode; + struct inode *bmp_vi, *vi = filp->f_mapping->host; int err, ret; ntfs_attr na; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 8804f09..113ebd9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * the page at all. For a more detailed explanation see ntfs_truncate() in * fs/ntfs/inode.c. * - * @cached_page and @lru_pvec are just optimizations for dealing with multiple - * pages. - * * Return 0 on success and -errno on error. 
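 *
 * (Context for the hunks below, stated as a summary rather than patch text:
 * the @cached_page/@lru_pvec plumbing is removed because newly created pages
 * are now inserted with add_to_page_cache_lru(), which adds the page to the
 * page cache and to the LRU in one call, roughly
 *
 *	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
 *
 * so the manual page_cache_get()/pagevec_add()/__pagevec_lru_add_file()
 * bookkeeping in __ntfs_grab_cache_pages() is no longer needed.)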
In the case that an error is * encountered it is possible that the initialized size will already have been * incremented some way towards @new_init_size but it is guaranteed that if @@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be * held by the caller. */ -static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, - struct page **cached_page, struct pagevec *lru_pvec) +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) { s64 old_init_size; loff_t old_i_size; @@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. * - * If a page is newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec @lru_pvec. - * - * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages - * are obtained at once instead of just one page and that 0 is returned on - * success and -errno on error. + * If a page is newly created, add it to lru list * * Note, the page locks are obtained in ascending page index order. */ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pgoff_t index, const unsigned nr_pages, struct page **pages, - struct page **cached_page, struct pagevec *lru_pvec) + struct page **cached_page) { int err, nr; @@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } } - err = add_to_page_cache(*cached_page, mapping, index, + err = add_to_page_cache_lru(*cached_page, mapping, index, GFP_KERNEL); if (unlikely(err)) { if (err == -EEXIST) @@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } pages[nr] = *cached_page; - page_cache_get(*cached_page); - if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ssize_t status, written; unsigned nr_pages; int err; - struct pagevec lru_pvec; ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " "pos 0x%llx, count 0x%lx.", @@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } } } - pagevec_init(&lru_pvec, 0); written = 0; /* * If the write starts beyond the initialized size, extend it up to the @@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ll = ni->initialized_size; read_unlock_irqrestore(&ni->size_lock, flags); if (pos > ll) { - err = ntfs_attr_extend_initialized(ni, pos, &cached_page, - &lru_pvec); + err = ntfs_attr_extend_initialized(ni, pos); if (err < 0) { ntfs_error(vol->sb, "Cannot perform write to inode " "0x%lx, attribute type 0x%x, because " @@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); /* Get and lock @do_pages starting at index @start_idx. */ status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, - pages, &cached_page, &lru_pvec); + pages, &cached_page); if (unlikely(status)) break; /* @@ -2077,7 +2062,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? 
"written" : "status", (unsigned long)written, (long)status); @@ -2149,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, /** * ntfs_file_fsync - sync a file to disk * @filp: file to be synced - * @dentry: dentry describing the file to sync * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync @@ -2165,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, * Also, if @datasync is true, we do not wait on the inode to be written out * but we always wait on the page cache pages to be written out. * - * Note: In the past @filp could be NULL so we ignore it as we don't need it - * anyway. - * * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore * this problem for now. */ -static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int ntfs_file_fsync(struct file *filp, int datasync) { - struct inode *vi = dentry->d_inode; + struct inode *vi = filp->f_mapping->host; int err, ret = 0; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index b7428c5..ec6d123 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize, * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ - BUG_ON(ecc > USHORT_MAX); + BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); @@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no * larger than 16 bits. */ - BUG_ON(ecc > USHORT_MAX); + BUG_ON(ecc > USHRT_MAX); bc->bc_crc32e = cpu_to_le32(crc); bc->bc_ecc = cpu_to_le16((u16)ecc); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 97e54b9..6a13ea6 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file) return 0; } -static int ocfs2_sync_file(struct file *file, - struct dentry *dentry, - int datasync) +static int ocfs2_sync_file(struct file *file, int datasync) { int err = 0; journal_t *journal; - struct inode *inode = dentry->d_inode; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, @@ -1053,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } /* - * This will intentionally not wind up calling vmtruncate(), + * This will intentionally not wind up calling simple_setsize(), * since all the work for a size change has been done above. * Otherwise, we could get into problems with truncate as * ip_alloc_sem is used there to protect against i_size @@ -2119,9 +2118,13 @@ relock: * direct write may have instantiated a few * blocks outside i_size. Trim these off again. * Don't need i_size_read because we hold i_mutex. + * + * XXX(hch): this looks buggy because ocfs2 did not + * actually implement ->truncate. 
Take a look at + * the new truncate sequence and update this accordingly */ if (*ppos + count > inode->i_size) - vmtruncate(inode, inode->i_size); + simple_setsize(inode, inode->i_size); ret = written; goto out_dio; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2c26ce2..0eaa929 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; if (unsuspend) - status = vfs_quota_enable( - sb_dqopt(sb)->files[type], - type, QFMT_OCFS2, - DQUOT_SUSPENDED); - else - status = vfs_quota_disable(sb, type, - DQUOT_SUSPENDED); + status = dquot_resume(sb, type); + else { + struct ocfs2_mem_dqinfo *oinfo; + + /* Cancel periodic syncing before suspending */ + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + status = dquot_suspend(sb, type); + } if (status < 0) break; } @@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) status = -ENOENT; goto out_quota_off; } - status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, - DQUOT_USAGE_ENABLED); + status = dquot_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } @@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* Turn off quotas. This will remove all dquot structures from * memory and so they will be automatically synced to global * quota files */ - vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | - DQUOT_LIMITS_ENABLED); + dquot_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); if (!inode) continue; iput(inode); @@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* Handle quota on quotactl */ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, - char *path, int remount) + char *path) { unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; @@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) return -EINVAL; - if (remount) - return 0; /* Just ignore it has been handled in - * ocfs2_remount() */ - return vfs_quota_enable(sb_dqopt(sb)->files[type], type, - format_id, DQUOT_LIMITS_ENABLED); + return dquot_enable(sb_dqopt(sb)->files[type], type, + format_id, DQUOT_LIMITS_ENABLED); } /* Handle quota off quotactl */ -static int ocfs2_quota_off(struct super_block *sb, int type, int remount) +static int ocfs2_quota_off(struct super_block *sb, int type) { - if (remount) - return 0; /* Ignore now and handle later in - * ocfs2_remount() */ - return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); + return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } static const struct quotactl_ops ocfs2_quotactl_ops = { .quota_on = ocfs2_quota_on, .quota_off = ocfs2_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, }; static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 399487c..6e7a329 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = { .aio_read = 
generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 3ceca05..648c9d8 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/stringify.h> +#include <linux/kernel.h> #include "ldm.h" #include "check.h" #include "msdos.h" @@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src) int h; /* high part */ - if ((x = src[0] - '0') <= '9'-'0') h = x; - else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; - else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; - else return -1; - h <<= 4; + x = h = hex_to_bin(src[0]); + if (h < 0) + return -1; /* low part */ - if ((x = src[1] - '0') <= '9'-'0') return h | x; - if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); - if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); - return -1; + h = hex_to_bin(src[1]); + if (h < 0) + return -1; + + return (x << 4) + h; } /** @@ -230,6 +230,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe, return kmap(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_map); /** * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer @@ -249,6 +250,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, } else kunmap(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_unmap); /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer @@ -279,6 +281,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, return 1; } +EXPORT_SYMBOL(generic_pipe_buf_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer @@ -294,6 +297,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_get(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_get); /** * generic_pipe_buf_confirm - verify contents of the pipe buffer @@ -309,6 +313,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, { return 0; } +EXPORT_SYMBOL(generic_pipe_buf_confirm); /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer @@ -323,6 +328,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, { page_cache_release(buf->page); } +EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, @@ -1172,16 +1178,20 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = roundup_pow_of_two(nr_pages); - if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) - return -EPERM; + if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) { + ret = -EPERM; + goto out; + } /* * The pipe needs to be at least 2 pages large to * guarantee POSIX behaviour. 
*/ - if (nr_pages < 2) - return -EINVAL; - ret = pipe_set_size(pipe, nr_pages); + if (arg < 2) { + ret = -EINVAL; + goto out; + } + ret = pipe_set_size(pipe, arg); break; } case F_GETPIPE_SZ: @@ -1192,6 +1202,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) break; } +out: mutex_unlock(&pipe->inode->i_mutex); return ret; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 885ab55..9b58d38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) shpending = p->signal->shared_pending.signal; blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); - num_threads = atomic_read(&p->signal->count); + num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); rcu_read_unlock(); @@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, tty_nr = new_encode_dev(tty_devnum(sig->tty)); } - num_threads = atomic_read(&sig->count); + num_threads = get_nr_threads(task); collect_sigign_sigcatch(task, &sigign, &sigcatch); cmin_flt = sig->cmin_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index c7f9f23..acb7ef8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root) return result; } -static int get_nr_threads(struct task_struct *tsk) -{ - unsigned long flags; - int count = 0; - - if (lock_task_sighand(tsk, &flags)) { - count = atomic_read(&tsk->signal->count); - unlock_task_sighand(tsk, &flags); - } - return count; -} - static int proc_cwd_link(struct inode *inode, struct path *path) { struct task_struct *task = get_proc_task(inode); @@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir, const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-EINVAL); + struct dentry *error; /* Allocate the inode */ error = ERR_PTR(-ENOMEM); @@ -2794,7 +2782,7 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { - struct dentry *result = ERR_PTR(-ENOENT); + struct dentry *result; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 43c1274..2791907 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ /* * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. - * - * Current inode allocations in the proc-fs (hex-numbers): - * - * 00000000 reserved - * 00000001-00000fff static entries (goners) - * 001 root-ino - * - * 00001000-00001fff unused - * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff - * 80000000-efffffff unused - * f0000000-ffffffff dynamic entries - * - * Goal: - * Once we split the thing into several virtual filesystems, - * we will get rid of magical ranges (and this comment, BTW). 
*/ static unsigned int get_inode_number(void) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index c837a77..6f37c39 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -588,7 +588,7 @@ static struct kcore_list kcore_text; */ static void __init proc_kcore_text_init(void) { - kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); + kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT); } #else static void __init proc_kcore_text_init(void) diff --git a/fs/proc/root.c b/fs/proc/root.c index 757c069..4258384 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,7 +110,6 @@ void __init proc_root_init(void) if (err) return; proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - err = PTR_ERR(proc_mnt); if (IS_ERR(proc_mnt)) { unregister_filesystem(&proc_fs_type); return; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47f5b14..aea1d3f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) { u64 pme = 0; @@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, return err; } +#endif /* HUGETLB_PAGE */ /* * /proc/pid/pagemap - an array mapping virtual pages to pfns @@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif pagemap_walk.mm = mm; pagemap_walk.private = &pm; diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 6f30c3d..6e8fc62 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -77,9 +77,10 @@ out: const struct file_operations qnx4_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 655a4c5..12c233d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); -#ifdef CONFIG_SMP -struct dqstats *dqstats_pcpu; -EXPORT_SYMBOL(dqstats_pcpu); -#endif static qsize_t inode_get_rsv_space(struct inode *inode); static void __dquot_initialize(struct inode *inode, int type); @@ -584,7 +580,7 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int vfs_quota_sync(struct super_block *sb, int type, int wait) +int dquot_quota_sync(struct super_block *sb, int type, int wait) { struct list_head *dirty; struct dquot *dquot; @@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait) return 0; } -EXPORT_SYMBOL(vfs_quota_sync); +EXPORT_SYMBOL(dquot_quota_sync); /* Free unused dquots from cache */ static void prune_dqcache(int count) @@ -676,27 +672,10 @@ static void prune_dqcache(int count) } } -static int dqstats_read(unsigned int type) -{ - int count = 0; -#ifdef CONFIG_SMP - int cpu; - for_each_possible_cpu(cpu) - count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type]; - /* Statistics reading is racy, but absolute accuracy isn't required */ - if (count < 0) - count = 0; -#else - count = dqstats.stat[type]; -#endif - return count; -} - /* * This is called from kswapd when we think we need some * more memory */ - static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) { if (nr) { @@ -704,7 +683,9 @@ static int
shrink_dqcache_memory(int nr, gfp_t gfp_mask) prune_dqcache(nr); spin_unlock(&dq_list_lock); } - return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure; + return ((unsigned) + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]) + /100) * sysctl_vfs_cache_pressure; } static struct shrinker dqcache_shrinker = { @@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) /* * This operation can block, but only after everything is updated */ -int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) +int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0; char warntype[MAXQUOTAS]; + int warn = flags & DQUOT_SPACE_WARN; + int reserve = flags & DQUOT_SPACE_RESERVE; + int nofail = flags & DQUOT_SPACE_NOFAIL; /* * First test before acquiring mutex - solves deadlocks when we @@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, continue; ret = check_bdq(inode->i_dquot[cnt], number, !warn, warntype+cnt); - if (ret) { + if (ret && !nofail) { spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; char warntype[MAXQUOTAS]; + int reserve = flags & DQUOT_SPACE_RESERVE; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) - transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA); + transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA); ret = __dquot_transfer(inode, transfer_to); dqput_all(transfer_to); @@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = { .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, }; +EXPORT_SYMBOL(dquot_operations); /* * Generic helper for ->open on filesystems supporting disk quotas. @@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open); /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ -int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) +int dquot_disable(struct super_block *sb, int type, unsigned int flags) { int cnt, ret = 0; struct quota_info *dqopt = sb_dqopt(sb); @@ -1995,14 +1980,15 @@ put_inodes: } return ret; } -EXPORT_SYMBOL(vfs_quota_disable); +EXPORT_SYMBOL(dquot_disable); -int vfs_quota_off(struct super_block *sb, int type, int remount) +int dquot_quota_off(struct super_block *sb, int type) { - return vfs_quota_disable(sb, type, remount ? 
DQUOT_SUSPENDED : - (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); + return dquot_disable(sb, type, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); } -EXPORT_SYMBOL(vfs_quota_off); +EXPORT_SYMBOL(dquot_quota_off); + /* * Turn quotas on on a device */ @@ -2120,36 +2106,43 @@ out_fmt: } /* Reenable quotas on remount RW */ -static int vfs_quota_on_remount(struct super_block *sb, int type) +int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode; - int ret; + int ret = 0, cnt; unsigned int flags; - mutex_lock(&dqopt->dqonoff_mutex); - if (!sb_has_quota_suspended(sb, type)) { + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) { + mutex_unlock(&dqopt->dqonoff_mutex); + continue; + } + inode = dqopt->files[cnt]; + dqopt->files[cnt] = NULL; + spin_lock(&dq_state_lock); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, + cnt); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); + spin_unlock(&dq_state_lock); mutex_unlock(&dqopt->dqonoff_mutex); - return 0; - } - inode = dqopt->files[type]; - dqopt->files[type] = NULL; - spin_lock(&dq_state_lock); - flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | - DQUOT_LIMITS_ENABLED, type); - dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); - spin_unlock(&dq_state_lock); - mutex_unlock(&dqopt->dqonoff_mutex); - flags = dquot_generic_flag(flags, type); - ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, - flags); - iput(inode); + flags = dquot_generic_flag(flags, cnt); + ret = vfs_load_quota_inode(inode, cnt, + dqopt->info[cnt].dqi_fmt_id, flags); + iput(inode); + } return ret; } +EXPORT_SYMBOL(dquot_resume); -int vfs_quota_on_path(struct super_block *sb, int type, int format_id, +int dquot_quota_on_path(struct super_block *sb, int type, int format_id, struct path *path) { int error = security_quota_on(path->dentry); @@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id, DQUOT_LIMITS_ENABLED); return error; } -EXPORT_SYMBOL(vfs_quota_on_path); +EXPORT_SYMBOL(dquot_quota_on_path); -int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, - int remount) +int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name) { struct path path; int error; - if (remount) - return vfs_quota_on_remount(sb, type); - error = kern_path(name, LOOKUP_FOLLOW, &path); if (!error) { - error = vfs_quota_on_path(sb, type, format_id, &path); + error = dquot_quota_on_path(sb, type, format_id, &path); path_put(&path); } return error; } -EXPORT_SYMBOL(vfs_quota_on); +EXPORT_SYMBOL(dquot_quota_on); /* * More powerful function for turning on quotas allowing setting * of individual quota flags */ -int vfs_quota_enable(struct inode *inode, int type, int format_id, - unsigned int flags) +int dquot_enable(struct inode *inode, int type, int format_id, + unsigned int flags) { int ret = 0; struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); /* Just unsuspend quotas? */ - if (flags & DQUOT_SUSPENDED) - return vfs_quota_on_remount(sb, type); + BUG_ON(flags & DQUOT_SUSPENDED); + if (!flags) return 0; /* Just updating flags needed? 
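 *
 * (A sketch of how the renamed interface is consumed, assuming a filesystem
 * remount path like the ocfs2 and reiserfs hunks in this series; the helper
 * names are taken from those hunks, everything else is illustrative:
 *
 *	if (*mount_flags & MS_RDONLY)
 *		err = dquot_suspend(sb, -1);
 *	else
 *		dquot_resume(sb, -1);
 *
 * i.e. the old remount parameter of vfs_quota_on()/vfs_quota_off() is gone
 * and suspend/resume across remount is requested explicitly.)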
*/ @@ -2229,13 +2218,13 @@ out_lock: load_quota: return vfs_load_quota_inode(inode, type, format_id, flags); } -EXPORT_SYMBOL(vfs_quota_enable); +EXPORT_SYMBOL(dquot_enable); /* * This function is used when filesystem needs to initialize quotas * during mount time. */ -int vfs_quota_on_mount(struct super_block *sb, char *qf_name, +int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { struct dentry *dentry; @@ -2261,24 +2250,7 @@ out: dput(dentry); return error; } -EXPORT_SYMBOL(vfs_quota_on_mount); - -/* Wrapper to turn on quotas when remounting rw */ -int vfs_dq_quota_on_remount(struct super_block *sb) -{ - int cnt; - int ret = 0, err; - - if (!sb->s_qcop || !sb->s_qcop->quota_on) - return -ENOSYS; - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); - if (err < 0 && !ret) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(vfs_dq_quota_on_remount); +EXPORT_SYMBOL(dquot_quota_on_mount); static inline qsize_t qbtos(qsize_t blocks) { @@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) spin_unlock(&dq_data_lock); } -int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, - struct fs_disk_quota *di) +int dquot_get_dqblk(struct super_block *sb, int type, qid_t id, + struct fs_disk_quota *di) { struct dquot *dquot; @@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, return 0; } -EXPORT_SYMBOL(vfs_get_dqblk); +EXPORT_SYMBOL(dquot_get_dqblk); #define VFS_FS_DQ_MASK \ (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ @@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) return 0; } -int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, +int dquot_set_dqblk(struct super_block *sb, int type, qid_t id, struct fs_disk_quota *di) { struct dquot *dquot; @@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, out: return rc; } -EXPORT_SYMBOL(vfs_set_dqblk); +EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common part of quota file information */ -int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; @@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return 0; } -EXPORT_SYMBOL(vfs_get_dqinfo); +EXPORT_SYMBOL(dquot_get_dqinfo); /* Generic routine for setting common part of quota file information */ -int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; int err = 0; @@ -2493,27 +2465,27 @@ out: mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return err; } -EXPORT_SYMBOL(vfs_set_dqinfo); +EXPORT_SYMBOL(dquot_set_dqinfo); -const struct quotactl_ops vfs_quotactl_ops = { - .quota_on = vfs_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk +const struct quotactl_ops dquot_quotactl_ops = { + .quota_on = dquot_quota_on, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; - +EXPORT_SYMBOL(dquot_quotactl_ops); static int do_proc_dqstats(struct ctl_table *table, int 
write, void __user *buffer, size_t *lenp, loff_t *ppos) { -#ifdef CONFIG_SMP - /* Update global table */ unsigned int type = (int *)table->data - dqstats.stat; - dqstats.stat[type] = dqstats_read(type); -#endif + + /* Update global table */ + dqstats.stat[type] = + percpu_counter_sum_positive(&dqstats.counter[type]); return proc_dointvec(table, write, buffer, lenp, ppos); } @@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = { static int __init dquot_init(void) { - int i; + int i, ret; unsigned long nr_hash, order; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); @@ -2624,12 +2596,11 @@ static int __init dquot_init(void) if (!dquot_hash) panic("Cannot create dquot hash table"); -#ifdef CONFIG_SMP - dqstats_pcpu = alloc_percpu(struct dqstats); - if (!dqstats_pcpu) - panic("Cannot create dquot stats table"); -#endif - memset(&dqstats, 0, sizeof(struct dqstats)); + for (i = 0; i < _DQST_DQSTAT_LAST; i++) { + ret = percpu_counter_init(&dqstats.counter[i], 0); + if (ret) + panic("Cannot create dquot stat counters"); + } /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ce3dfd0..b299961 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, if (IS_ERR(pathname)) return PTR_ERR(pathname); if (sb->s_qcop->quota_on) - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + ret = sb->s_qcop->quota_on(sb, type, id, pathname); putname(pathname); return ret; } @@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_QUOTAOFF: if (!sb->s_qcop->quota_off) return -ENOSYS; - return sb->s_qcop->quota_off(sb, type, 0); + return sb->s_qcop->quota_off(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 78f613c..4884ac5 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, }; const struct inode_operations ramfs_file_inode_operations = { + .setattr = simple_setattr, .getattr = simple_getattr, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5ea4ad8..d532c20 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = { .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .llseek = generic_file_llseek, @@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) return ret; } - ret = vmtruncate(inode, newsize); + ret = simple_setsize(inode, newsize); return ret; } @@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) /* pick out size-changing events */ if (ia->ia_valid & ATTR_SIZE) { - loff_t size = i_size_read(inode); + loff_t size = inode->i_size; + if (ia->ia_size != size) { ret = ramfs_nommu_resize(inode, ia->ia_size, size); if (ret < 0 || ia->ia_valid == 
ATTR_SIZE) @@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) } } - ret = inode_setattr(inode, ia); + generic_setattr(inode, ia); out: ia->ia_valid = old_ia_valid; return ret; diff --git a/fs/read_write.c b/fs/read_write.c index 113386d..9c04852 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } EXPORT_SYMBOL(generic_file_llseek); +/** + * noop_llseek - No Operation Performed llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @origin: type of seek + * + * This is an implementation of ->llseek useable for the rare special case when + * userspace expects the seek to succeed but the (device) file is actually not + * able to perform the seek. In this case you use noop_llseek() instead of + * falling back to the default implementation of ->llseek. + */ +loff_t noop_llseek(struct file *file, loff_t offset, int origin) +{ + return file->f_pos; +} +EXPORT_SYMBOL(noop_llseek); + loff_t no_llseek(struct file *file, loff_t offset, int origin) { return -ESPIPE; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 0793044..198dabf 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -14,10 +14,10 @@ extern const struct reiserfs_key MIN_KEY; static int reiserfs_readdir(struct file *, void *, filldir_t); -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync); +static int reiserfs_dir_fsync(struct file *filp, int datasync); const struct file_operations reiserfs_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = reiserfs_readdir, .fsync = reiserfs_dir_fsync, @@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = { #endif }; -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, - int datasync) +static int reiserfs_dir_fsync(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int err; reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 9977df9..b82cdd8 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) * be removed... 
*/ -static int reiserfs_sync_file(struct file *filp, - struct dentry *dentry, int datasync) +static int reiserfs_sync_file(struct file *filp, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; int err; int barrier_done; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 59125fb..9822fa1 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA int i; int ms_active_set; + int quota_enabled[MAXQUOTAS]; #endif /* compose key to look for "save" links */ @@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s) } /* Turn on quotas so that they are updated correctly */ for (i = 0; i < MAXQUOTAS; i++) { + quota_enabled[i] = 1; if (REISERFS_SB(s)->s_qf_names[i]) { - int ret = reiserfs_quota_on_mount(s, i); + int ret; + + if (sb_has_quota_active(s, i)) { + quota_enabled[i] = 0; + continue; + } + ret = reiserfs_quota_on_mount(s, i); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " @@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { - if (sb_dqopt(s)->files[i]) - vfs_quota_off(s, i, 0); + if (sb_dqopt(s)->files[i] && quota_enabled[i]) + dquot_quota_off(s, i); } if (ms_active_set) /* Restore the flag back */ @@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; + dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + reiserfs_write_lock(s); if (s->s_dirt) @@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, char *, int); +static int reiserfs_quota_on(struct super_block *, int, int, char *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = { static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk, + .quota_off = dquot_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, }; #endif @@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (s->s_flags & MS_RDONLY) /* it is read-only already */ goto out_ok; + + err = dquot_suspend(s, -1); + if (err < 0) + goto out_err; + /* try to remount file system with read-only permissions */ if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { @@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) s->s_dirt = 0; if (!(*mount_flags & MS_RDONLY)) { + dquot_resume(s, -1); finish_unfinished(s); reiserfs_xattr_init(s, *mount_flags); } @@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type) */ static int reiserfs_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, 
REISERFS_SB(sb)->s_qf_names[type], - REISERFS_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *name, int remount) + char *name) { int err; struct path path; @@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; - /* No more checks needed? Path and format_id are bogus anyway... */ - if (remount) - return vfs_quota_on(sb, type, format_id, name, 1); + err = kern_path(name, LOOKUP_FOLLOW, &path); if (err) return err; @@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, if (err) goto out; } - err = vfs_quota_on_path(sb, type, format_id, &path); + err = dquot_quota_on_path(sb, type, format_id, &path); out: path_put(&path); return err; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 3e4803b..00a70ca 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *); const struct file_operations smb_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = smb_readdir, - .ioctl = smb_ioctl, + .unlocked_ioctl = smb_ioctl, .open = smb_dir_open, }; diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index dbf6548..8e187a0 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -28,8 +28,9 @@ #include "proto.h" static int -smb_fsync(struct file *file, struct dentry * dentry, int datasync) +smb_fsync(struct file *file, int datasync) { + struct dentry *dentry = file->f_path.dentry; struct smb_sb_info *server = server_from_dentry(dentry); int result; @@ -437,7 +438,7 @@ const struct file_operations smb_file_operations = .aio_read = smb_file_aio_read, .write = do_sync_write, .aio_write = smb_file_aio_write, - .ioctl = smb_ioctl, + .unlocked_ioctl = smb_ioctl, .mmap = smb_file_mmap, .open = smb_file_open, .release = smb_file_release, diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index dfa1d67..9551cb6 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr) error = server->ops->truncate(inode, attr->ia_size); if (error) goto out; - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); if (error) goto out; refresh = 1; diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c index dbae1f8..0721531 100644 --- a/fs/smbfs/ioctl.c +++ b/fs/smbfs/ioctl.c @@ -13,6 +13,7 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/highuid.h> +#include <linux/smp_lock.h> #include <linux/net.h> #include <linux/smb_fs.h> @@ -22,14 +23,14 @@ #include "proto.h" -int -smb_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +long +smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - struct smb_sb_info *server = server_from_inode(inode); + struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode); struct smb_conn_opt opt; int result = -EINVAL; + lock_kernel(); switch (cmd) { uid16_t uid16; uid_t uid32; @@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp, default: break; } + unlock_kernel(); return result; } diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 03f456c..05939a6 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -67,7 
+67,7 @@ extern const struct address_space_operations smb_file_aops; extern const struct file_operations smb_file_operations; extern const struct inode_operations smb_file_inode_operations; /* ioctl.c */ -extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); +extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* smbiod.c */ extern void smbiod_wake_up(void); extern int smbiod_register_server(struct smb_sb_info *server); diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c index 54350b5..00b2909 100644 --- a/fs/smbfs/symlink.c +++ b/fs/smbfs/symlink.c @@ -15,7 +15,6 @@ #include <linux/pagemap.h> #include <linux/net.h> #include <linux/namei.h> -#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 25a00d1..cc6ce8a 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -26,6 +26,17 @@ config SQUASHFS If unsure, say N. +config SQUASHFS_XATTRS + bool "Squashfs XATTR support" + depends on SQUASHFS + default n + help + Saying Y here includes support for extended attributes (xattrs). + Xattrs are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page). + + If unsure, say N. + config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index df8a19e..2cee3e9 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o + diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 49daaf6..62e63ad 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -40,11 +40,13 @@ #include <linux/fs.h> #include <linux/vfs.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" /* * Initialise VFS inode with the base inode information common to all @@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) int err, type, offset = SQUASHFS_INODE_OFFSET(ino); union squashfs_inode squashfs_ino; struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + int xattr_id = SQUASHFS_INVALID_XATTR; TRACE("Entered squashfs_read_inode\n"); @@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino) frag_offset = 0; } + xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - @@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; @@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &squashfs_symlink_inode_ops; inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; 
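/*
 * For the extended (xattr-capable) symlink type handled just below, the
 * on-disk xattr id trails the symlink target: the added code skips the
 * target with a NULL-buffer squashfs_read_metadata() call, reads the
 * trailing __le32 id, and squashfs_xattr_lookup() later resolves that id
 * through the xattr id table read at mount time.
 */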
squashfs_i(inode)->offset = offset; + if (type == SQUASHFS_LSYMLINK_TYPE) { + __le32 xattr; + + err = squashfs_read_metadata(sb, NULL, &block, + &offset, inode->i_size); + if (err < 0) + goto failed_read; + err = squashfs_read_metadata(sb, &xattr, &block, + &offset, sizeof(xattr)); + if (err < 0) + goto failed_read; + xattr_id = le32_to_cpu(xattr); + } + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, block, offset); break; } case SQUASHFS_BLKDEV_TYPE: - case SQUASHFS_CHRDEV_TYPE: - case SQUASHFS_LBLKDEV_TYPE: - case SQUASHFS_LCHRDEV_TYPE: { + case SQUASHFS_CHRDEV_TYPE: { struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; unsigned int rdev; @@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino) SQUASHFS_INODE_BLK(ino), offset, rdev); break; } + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LCHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } case SQUASHFS_FIFO_TYPE: - case SQUASHFS_SOCKET_TYPE: - case SQUASHFS_LFIFO_TYPE: - case SQUASHFS_LSOCKET_TYPE: { + case SQUASHFS_SOCKET_TYPE: { struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, @@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino) init_special_inode(inode, inode->i_mode, 0); break; } + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_LFIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + xattr_id = le32_to_cpu(sqsh_ino->xattr); + inode->i_op = &squashfs_inode_ops; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } default: ERROR("Unknown inode type %d in squashfs_iget!\n", type); return -EINVAL; } + if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { + err = squashfs_xattr_lookup(sb, xattr_id, + &squashfs_i(inode)->xattr_count, + &squashfs_i(inode)->xattr_size, + &squashfs_i(inode)->xattr); + if (err < 0) + goto failed_read; + inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + + 1; + } else + squashfs_i(inode)->xattr_count = 0; + return 0; failed_read: ERROR("Unable to read inode 0x%llx\n", ino); return err; } + + +const struct inode_operations squashfs_inode_ops = { + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 5266bd8..7a9464d 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -57,11 +57,13 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/dcache.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" /* * Lookup name in the directory index, 
returning the location of the metadata @@ -237,5 +239,7 @@ failed: const struct inode_operations squashfs_dir_inode_ops = { - .lookup = squashfs_lookup + .lookup = squashfs_lookup, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index fe2587a..733a17c 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long, unsigned int); extern int squashfs_read_inode(struct inode *, long long); +/* xattr.c */ +extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t); + /* - * Inodes, files and decompressor operations + * Inodes, files, decompressor and xattr operations */ /* dir.c */ @@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops; /* file.c */ extern const struct address_space_operations squashfs_aops; +/* inode.c */ +extern const struct inode_operations squashfs_inode_ops; + /* namei.c */ extern const struct inode_operations squashfs_dir_inode_ops; /* symlink.c */ extern const struct address_space_operations squashfs_symlink_aops; +extern const struct inode_operations squashfs_symlink_inode_ops; + +/* xattr.c */ +extern const struct xattr_handler *squashfs_xattr_handlers[]; /* zlib_wrapper.c */ extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 7902424..8eabb80 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -46,6 +46,7 @@ #define SQUASHFS_NAME_LEN 256 #define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_XATTR (0xffffffffU) #define SQUASHFS_INVALID_BLK (-1LL) /* Filesystem flags */ @@ -96,6 +97,13 @@ #define SQUASHFS_LFIFO_TYPE 13 #define SQUASHFS_LSOCKET_TYPE 14 +/* Xattr types */ +#define SQUASHFS_XATTR_USER 0 +#define SQUASHFS_XATTR_TRUSTED 1 +#define SQUASHFS_XATTR_SECURITY 2 +#define SQUASHFS_XATTR_VALUE_OOL 256 +#define SQUASHFS_XATTR_PREFIX_MASK 0xff + /* Flag whether block is compressed or uncompressed, bit is set if block is * uncompressed */ #define SQUASHFS_COMPRESSED_BIT (1 << 15) @@ -174,6 +182,24 @@ #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) +/* xattr id lookup table defines */ +#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) + +#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\ + sizeof(u64)) +#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff)) /* cached data constants for filesystem */ #define SQUASHFS_CACHED_BLKS 8 @@ -228,7 +254,7 @@ struct squashfs_super_block { __le64 root_inode; __le64 bytes_used; __le64 id_table_start; - __le64 xattr_table_start; + __le64 xattr_id_table_start; __le64 inode_table_start; __le64 directory_table_start; __le64 fragment_table_start; @@ -261,6 +287,17 @@ struct squashfs_ipc_inode { __le32 nlink; }; +struct squashfs_lipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 xattr; +}; + struct squashfs_dev_inode { __le16 inode_type; __le16 mode; @@ -272,6 +309,18 @@ struct 
squashfs_dev_inode { __le32 rdev; }; +struct squashfs_ldev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; + __le32 xattr; +}; + struct squashfs_symlink_inode { __le16 inode_type; __le16 mode; @@ -349,12 +398,14 @@ struct squashfs_ldir_inode { union squashfs_inode { struct squashfs_base_inode base; struct squashfs_dev_inode dev; + struct squashfs_ldev_inode ldev; struct squashfs_symlink_inode symlink; struct squashfs_reg_inode reg; struct squashfs_lreg_inode lreg; struct squashfs_dir_inode dir; struct squashfs_ldir_inode ldir; struct squashfs_ipc_inode ipc; + struct squashfs_lipc_inode lipc; }; struct squashfs_dir_entry { @@ -377,4 +428,27 @@ struct squashfs_fragment_entry { unsigned int unused; }; +struct squashfs_xattr_entry { + __le16 type; + __le16 size; + char data[0]; +}; + +struct squashfs_xattr_val { + __le32 vsize; + char value[0]; +}; + +struct squashfs_xattr_id { + __le64 xattr; + __le32 count; + __le32 size; +}; + +struct squashfs_xattr_id_table { + __le64 xattr_table_start; + __le32 xattr_ids; + __le32 unused; +}; + #endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index fbfca30..d3e3a37 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -26,6 +26,9 @@ struct squashfs_inode_info { u64 start; int offset; + u64 xattr; + unsigned int xattr_size; + int xattr_count; union { struct { u64 fragment_block; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 2e77dc5..d9037a5 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -61,6 +61,7 @@ struct squashfs_sb_info { int next_meta_index; __le64 *id_table; __le64 *fragment_index; + __le64 *xattr_id_table; struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; @@ -68,9 +69,11 @@ struct squashfs_sb_info { __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; + u64 xattr_table; unsigned int block_size; unsigned short block_log; long long bytes_used; unsigned int inodes; + int xattr_ids; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 48b6f4a..88b4f86 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -36,12 +36,14 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/magic.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" +#include "xattr.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; @@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) long long root_inode; unsigned short flags; unsigned int fragments; - u64 lookup_table_start; + u64 lookup_table_start, xattr_id_table_start; int err; TRACE("Entered squashfs_fill_superblock\n"); @@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) if (msblk->decompressor == NULL) goto failed_mount; - /* - * Check if there's xattrs in the filesystem. These are not - * supported in this version, so warn that they will be ignored. 
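 *
 * (With xattr support now merged, that warning goes away: the superblock
 * field becomes xattr_id_table_start, and each xattr id read from an inode
 * is resolved through a squashfs_xattr_id entry giving the location, count
 * and size of that inode's xattr list inside the xattr metadata, the
 * location being split into a metadata block and offset by
 * SQUASHFS_XATTR_BLK()/SQUASHFS_XATTR_OFFSET().  The id table itself is
 * loaded at mount by squashfs_read_xattr_id_table(), as the super.c hunk
 * below shows.)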
- */ - if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK) - ERROR("Xattrs in filesystem, these will be ignored\n"); - /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); @@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) allocate_lookup_table: lookup_table_start = le64_to_cpu(sblk->lookup_table_start); if (lookup_table_start == SQUASHFS_INVALID_BLK) - goto allocate_root; + goto allocate_xattr_table; /* Allocate and read inode lookup table */ msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, @@ -266,6 +261,21 @@ allocate_lookup_table: sb->s_export_op = &squashfs_export_ops; +allocate_xattr_table: + sb->s_xattr = squashfs_xattr_handlers; + xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); + if (xattr_id_table_start == SQUASHFS_INVALID_BLK) + goto allocate_root; + + /* Allocate and read xattr id lookup table */ + msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, + xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); + if (IS_ERR(msblk->xattr_id_table)) { + err = PTR_ERR(msblk->xattr_id_table); + msblk->xattr_id_table = NULL; + if (err != -ENOTSUPP) + goto failed_mount; + } allocate_root: root = new_inode(sb); if (!root) { @@ -301,6 +311,7 @@ failed_mount: kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); + kfree(msblk->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); @@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb) kfree(sbi->fragment_index); kfree(sbi->meta_index); kfree(sbi->inode_lookup_table); + kfree(sbi->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 32b911f..ec86434 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -35,11 +35,13 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/pagemap.h> +#include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" static int squashfs_symlink_readpage(struct file *file, struct page *page) { @@ -114,3 +116,12 @@ error_out: const struct address_space_operations squashfs_symlink_aops = { .readpage = squashfs_symlink_readpage }; + +const struct inode_operations squashfs_symlink_inode_ops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .getxattr = generic_getxattr, + .listxattr = squashfs_listxattr +}; + diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c new file mode 100644 index 0000000..c7655e8 --- /dev/null +++ b/fs/squashfs/xattr.c @@ -0,0 +1,323 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.c + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/xattr.h> +#include <linux/slab.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const struct xattr_handler *squashfs_xattr_handler(int); + +ssize_t squashfs_listxattr(struct dentry *d, char *buffer, + size_t buffer_size) +{ + struct inode *inode = d->d_inode; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + size_t rest = buffer_size; + int err; + + /* check that the file system has xattrs */ + if (msblk->xattr_id_table == NULL) + return -EOPNOTSUPP; + + /* loop reading each xattr name */ + while (count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + const struct xattr_handler *handler; + int name_size, prefix_size = 0; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); + if (handler) + prefix_size = handler->list(d, buffer, rest, NULL, + name_size, handler->flags); + if (prefix_size) { + if (buffer) { + if (prefix_size + name_size + 1 > rest) { + err = -ERANGE; + goto failed; + } + buffer += prefix_size; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, name_size); + if (err < 0) + goto failed; + if (buffer) { + buffer[name_size] = '\0'; + buffer += name_size + 1; + } + rest -= prefix_size + name_size + 1; + } else { + /* no handler or insufficient privileges, so skip */ + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + } + + + /* skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = buffer_size - rest; + +failed: + return err; +} + + +static int squashfs_xattr_get(struct inode *inode, int name_index, + const char *name, void *buffer, size_t buffer_size) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) + + msblk->xattr_table; + int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr); + int count = squashfs_i(inode)->xattr_count; + int name_len = strlen(name); + int err, vsize; + char *target = kmalloc(name_len, GFP_KERNEL); + + if (target == NULL) + return -ENOMEM; + + /* loop reading each xattr name */ + for (; count; count--) { + struct squashfs_xattr_entry entry; + struct squashfs_xattr_val val; + int type, prefix, name_size; + + err = squashfs_read_metadata(sb, &entry, &start, &offset, + sizeof(entry)); + if (err < 0) + goto failed; + + name_size = le16_to_cpu(entry.size); + type = le16_to_cpu(entry.type); + prefix = type & SQUASHFS_XATTR_PREFIX_MASK; + + if (prefix == name_index && name_size == name_len) + err = squashfs_read_metadata(sb,
target, &start, + &offset, name_size); + else + err = squashfs_read_metadata(sb, NULL, &start, + &offset, name_size); + if (err < 0) + goto failed; + + if (prefix == name_index && name_size == name_len && + strncmp(target, name, name_size) == 0) { + /* found xattr */ + if (type & SQUASHFS_XATTR_VALUE_OOL) { + __le64 xattr; + /* val is a reference to the real location */ + err = squashfs_read_metadata(sb, &val, &start, + &offset, sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, &xattr, &start, + &offset, sizeof(xattr)); + if (err < 0) + goto failed; + xattr = le64_to_cpu(xattr); + start = SQUASHFS_XATTR_BLK(xattr) + + msblk->xattr_table; + offset = SQUASHFS_XATTR_OFFSET(xattr); + } + /* read xattr value */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + + vsize = le32_to_cpu(val.vsize); + if (buffer) { + if (vsize > buffer_size) { + err = -ERANGE; + goto failed; + } + err = squashfs_read_metadata(sb, buffer, &start, + &offset, vsize); + if (err < 0) + goto failed; + } + break; + } + + /* no match, skip remaining xattr entry */ + err = squashfs_read_metadata(sb, &val, &start, &offset, + sizeof(val)); + if (err < 0) + goto failed; + err = squashfs_read_metadata(sb, NULL, &start, &offset, + le32_to_cpu(val.vsize)); + if (err < 0) + goto failed; + } + err = count ? vsize : -ENODATA; + +failed: + kfree(target); + return err; +} + + +/* + * User namespace support + */ +static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + if (list && XATTR_USER_PREFIX_LEN <= list_size) + memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + return XATTR_USER_PREFIX_LEN; +} + +static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, + size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = squashfs_user_list, + .get = squashfs_user_get +}; + +/* + * Trusted namespace support + */ +static size_t squashfs_trusted_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size) + memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return XATTR_TRUSTED_PREFIX_LEN; +} + +static int squashfs_trusted_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + buffer, size); +} + +static const struct xattr_handler squashfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = squashfs_trusted_list, + .get = squashfs_trusted_get +}; + +/* + * Security namespace support + */ +static size_t squashfs_security_list(struct dentry *d, char *list, + size_t list_size, const char *name, size_t name_len, int type) +{ + if (list && XATTR_SECURITY_PREFIX_LEN <= list_size) + memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); + return XATTR_SECURITY_PREFIX_LEN; +} + +static int squashfs_security_get(struct dentry *d, const char *name, + void *buffer, size_t size, int type) +{ + if (name[0] == '\0') + return -EINVAL; + + return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + buffer, size); +} + +static const struct 
xattr_handler squashfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = squashfs_security_list, + .get = squashfs_security_get +}; + +static inline const struct xattr_handler *squashfs_xattr_handler(int type) +{ + if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) + /* ignore unrecognised type */ + return NULL; + + switch (type & SQUASHFS_XATTR_PREFIX_MASK) { + case SQUASHFS_XATTR_USER: + return &squashfs_xattr_user_handler; + case SQUASHFS_XATTR_TRUSTED: + return &squashfs_xattr_trusted_handler; + case SQUASHFS_XATTR_SECURITY: + return &squashfs_xattr_security_handler; + default: + /* ignore unrecognised type */ + return NULL; + } +} + +const struct xattr_handler *squashfs_xattr_handlers[] = { + &squashfs_xattr_user_handler, + &squashfs_xattr_trusted_handler, + &squashfs_xattr_security_handler, + NULL +}; + diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h new file mode 100644 index 0000000..9da071a --- /dev/null +++ b/fs/squashfs/xattr.h @@ -0,0 +1,46 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr.h + */ + +#ifdef CONFIG_SQUASHFS_XATTRS +extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, + u64 *, int *); +extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, + int *, unsigned long long *); +#else +static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, + u64 start, u64 *xattr_table_start, int *xattr_ids) +{ + ERROR("Xattrs in filesystem, these will be ignored\n"); + return ERR_PTR(-ENOTSUPP); +} + +static inline int squashfs_xattr_lookup(struct super_block *sb, + unsigned int index, int *count, int *size, + unsigned long long *xattr) +{ + return 0; +} +#define squashfs_listxattr NULL +#define generic_getxattr NULL +#define squashfs_xattr_handlers NULL +#endif diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c new file mode 100644 index 0000000..cfb4110 --- /dev/null +++ b/fs/squashfs/xattr_id.c @@ -0,0 +1,100 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2010 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * xattr_id.c + */ + +/* + * This file implements code to map the 32-bit xattr id stored in the inode + * into the on disk location of the xattr data. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Map xattr id using the xattr id look up table + */ +int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, + int *count, unsigned int *size, unsigned long long *xattr) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_XATTR_BLOCK(index); + int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + struct squashfs_xattr_id id; + int err; + + err = squashfs_read_metadata(sb, &id, &start_block, &offset, + sizeof(id)); + if (err < 0) + return err; + + *xattr = le64_to_cpu(id.xattr); + *size = le32_to_cpu(id.size); + *count = le32_to_cpu(id.count); + return 0; +} + + +/* + * Read uncompressed xattr id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, + u64 *xattr_table_start, int *xattr_ids) +{ + unsigned int len; + __le64 *xid_table; + struct squashfs_xattr_id_table id_table; + int err; + + err = squashfs_read_table(sb, &id_table, start, sizeof(id_table)); + if (err < 0) { + ERROR("unable to read xattr id table\n"); + return ERR_PTR(err); + } + *xattr_table_start = le64_to_cpu(id_table.xattr_table_start); + *xattr_ids = le32_to_cpu(id_table.xattr_ids); + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + + TRACE("In read_xattr_index_table, length %d\n", len); + + /* Allocate xattr id lookup table indexes */ + xid_table = kmalloc(len, GFP_KERNEL); + if (xid_table == NULL) { + ERROR("Failed to allocate xattr id index table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len); + if (err < 0) { + ERROR("unable to read xattr id index table\n"); + kfree(xid_table); + return ERR_PTR(err); + } + + return xid_table; +} @@ -24,7 +24,6 @@ #include <linux/slab.h> #include <linux/acct.h> #include <linux/blkdev.h> -#include <linux/quotaops.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ @@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type) init_rwsem(&s->s_dquot.dqptr_sem); init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; - s->dq_op = sb_dquot_ops; - s->s_qcop = sb_quotactl_ops; s->s_op = &default_op; s->s_time_gran = 1000000000; } @@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - vfs_dq_off(s, 0); fs->kill_sb(s); put_filesystem(fs); put_super(s); @@ -524,7 +520,7 @@ rescan: int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; - int remount_rw, remount_ro; + int remount_ro; if (sb->s_frozen != SB_UNFROZEN) return -EBUSY; @@ -540,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) sync_filesystem(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); - remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & 
MS_RDONLY); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ @@ -549,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) mark_files_ro(sb); else if (!fs_may_remount_ro(sb)) return -EBUSY; - retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) - return -EBUSY; } if (sb->s_op->remount_fs) { @@ -560,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return retval; } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); - if (remount_rw) - vfs_dq_quota_on_remount(sb); + /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in @@ -946,8 +937,8 @@ out: EXPORT_SYMBOL_GPL(vfs_kern_mount); /** - * freeze_super -- lock the filesystem and force it into a consistent state - * @super: the super to lock + * freeze_super - lock the filesystem and force it into a consistent state + * @sb: the super to lock * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return @@ -130,12 +130,10 @@ void emergency_sync(void) /* * Generic function to fsync a file. - * - * filp may be NULL if called via the msync of a vma. */ -int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +int file_fsync(struct file *filp, int datasync) { - struct inode * inode = dentry->d_inode; + struct inode *inode = filp->f_mapping->host; struct super_block * sb; int ret, err; @@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, file->f_path.dentry, datasync); + err = file->f_op->fsync(file, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index bbd77e9..bde1a4c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -117,13 +117,11 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) if (error) goto out; - iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ - - error = inode_setattr(inode, iattr); - if (error) - goto out; + /* this ignores size changes */ + generic_setattr(inode, iattr); error = sysfs_sd_setattr(sd, iattr); + out: mutex_unlock(&sysfs_mutex); return error; diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 1dabed2..79941e4 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = sysv_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 96340c0..750cc22 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 4573734..d4a5380 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait) * then attach current time stamp. * But if the filesystem was marked clean, keep it clean. 
*/ + sb->s_dirt = 0; old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); if (sbi->s_type == FSTYPE_SYSV4) { if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5692cf7..12f445c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len) * the page locked, and it locks @ui_mutex. However, write-back does take inode * @i_mutex, which means other VFS operations may be run on this inode at the * same time. And the problematic one is truncation to smaller size, from where - * we have to call 'vmtruncate()', which first changes @inode->i_size, then + * we have to call 'simple_setsize()', which first changes @inode->i_size, then * drops the truncated pages. And while dropping the pages, it takes the page - * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with + * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This * means that @inode->i_size is changed while @ui_mutex is unlocked. * + * XXX: with the new truncate the above is not true anymore, the simple_setsize + * calls can be replaced with the individual components. + * * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond * inode size. How do we do this if @inode->i_size may became smaller while we * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the @@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, budgeted = 0; } - err = vmtruncate(inode, new_size); + err = simple_setsize(inode, new_size); if (err) goto out_budg; @@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (attr->ia_valid & ATTR_SIZE) { dbg_gen("size %lld -> %lld", inode->i_size, new_size); - err = vmtruncate(inode, new_size); + err = simple_setsize(inode, new_size); if (err) goto out; } @@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); - /* 'vmtruncate()' changed @i_size, update @ui_size */ + /* 'simple_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +int ubifs_fsync(struct file *file, int datasync) { - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; int err; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index bd2542d..2eef553 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb { * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot * make sure @inode->i_size is always changed under @ui_mutex, because it - * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock + * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock * with 'ubifs_writepage()' (see file.c). All the other inode fields are * changed under @ui_mutex, so they do not need "shadow" fields. Note, one * could consider to rework locking and base it on "shadow" fields. 
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ -int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_fsync(struct file *file, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */ diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 9a9378b..b608efa 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -21,7 +21,6 @@ #include "udfdecl.h" -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bitops.h> @@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb, udf_debug("byte=%2x\n", ((char *)bh->b_data)[(bit + i) >> 3]); } else { - if (inode) - dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, bit = block % (sb->s_blocksize << 3); while (bit < (sb->s_blocksize << 3) && block_count > 0) { - if (!udf_test_bit(bit, bh->b_data)) + if (!udf_clear_bit(bit, bh->b_data)) goto out; - else if (dquot_prealloc_block(inode, 1)) - goto out; - else if (!udf_clear_bit(bit, bh->b_data)) { - udf_debug("bit already cleared for block %d\n", bit); - dquot_free_block(inode, 1); - goto out; - } block_count--; alloc_count++; bit++; @@ -338,20 +328,6 @@ search_back: } got_block: - - /* - * Check quota for allocation of this block. - */ - if (inode) { - int ret = dquot_alloc_block(inode, 1); - - if (ret) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = ret; - return 0; - } - } - newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - (sizeof(struct spaceBitmapDesc) << 3); @@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb, } iinfo = UDF_I(table); - /* We do this up front - There are some error conditions that - could occure, but.. oh well */ - if (inode) - dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && dquot_prealloc_block(inode, - alloc_count > block_count ? 
block_count : alloc_count)) - alloc_count = 0; - else if (alloc_count > block_count) { + if (alloc_count > block_count) { alloc_count = block_count; eloc.logicalBlockNum += alloc_count; elen -= (alloc_count << sb->s_blocksize_bits); @@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - if (inode) { - *err = dquot_alloc_block(inode, 1); - if (*err) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - return 0; - } - } if (goal_elen) udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 3a84455..51552bf 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = udf_readdir, .unlocked_ioctl = udf_ioctl, - .fsync = simple_fsync, + .fsync = generic_file_fsync, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index baae3a7..94e06d6 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,7 +34,6 @@ #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> #include <linux/smp_lock.h> @@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .unlocked_ioctl = udf_ioctl, - .open = dquot_file_open, + .open = generic_file_open, .mmap = generic_file_mmap, .write = do_sync_write, .aio_write = udf_file_aio_write, .release = udf_release_file, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; -int udf_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - int error; - - error = inode_change_ok(inode, iattr); - if (error) - return error; - - if (is_quota_modification(inode, iattr)) - dquot_initialize(inode); - - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = dquot_transfer(inode, iattr); - if (error) - return error; - } - - return inode_setattr(inode, iattr); -} - const struct inode_operations udf_file_inode_operations = { .truncate = udf_truncate, - .setattr = udf_setattr, }; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 2b5586c..18cd711 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -20,7 +20,6 @@ #include "udfdecl.h" #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/sched.h> #include <linux/slab.h> @@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode) struct super_block *sb = inode->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. 
- */ - dquot_free_inode(inode); - dquot_drop(inode); - clear_inode(inode); mutex_lock(&sbi->s_alloc_mutex); @@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block, ret; + int block; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - dquot_initialize(inode); - ret = dquot_alloc_inode(inode); - if (ret) { - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; - iput(inode); - *err = ret; - return NULL; - } - *err = 0; return inode; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8a3fbd1..124852b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -36,7 +36,6 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> @@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { - if (!is_bad_inode(inode)) - dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode) (unsigned long long)iinfo->i_lenExtents); } - dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 585f733..bf5fc67 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -27,7 +27,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/quotaops.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/sched.h> @@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; - dquot_initialize(dir); - lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { @@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); - lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); @@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - dquot_initialize(dir); - lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) @@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; - dquot_initialize(dir); - retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; - dquot_initialize(dir); - retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; - dquot_initialize(dir); - lock_kernel(); inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) @@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; - dquot_initialize(dir); - lock_kernel(); if 
(inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { unlock_kernel(); @@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); - dquot_initialize(old_dir); - dquot_initialize(new_dir); - lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { @@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = { const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, - .setattr = udf_setattr, .link = udf_link, .unlink = udf_unlink, .symlink = udf_symlink, @@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, - .setattr = udf_setattr, }; diff --git a/fs/udf/super.c b/fs/udf/super.c index 1e4543c..612d1e2 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) { struct udf_options uopt; struct udf_sb_info *sbi = UDF_SB(sb); + int error = 0; uopt.flags = sbi->s_flags; uopt.uid = sbi->s_uid; @@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) *flags |= MS_RDONLY; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { - unlock_kernel(); - return 0; - } + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out_unlock; + if (*flags & MS_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); +out_unlock: unlock_kernel(); - return 0; + return error; } /* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ @@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* Fill in the rest of the superblock */ sb->s_op = &udf_sb_ops; sb->s_export_op = &udf_export_ops; - sb->dq_op = NULL; + sb->s_dirt = 0; sb->s_magic = UDF_SUPER_MAGIC; sb->s_time_gran = 1000; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 9079ff7..2bac035 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); -extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); extern int udf_sync_inode(struct inode *); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 5cfa4d8..048484f 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -12,7 +12,6 @@ #include <linux/stat.h> #include <linux/time.h> #include <linux/string.h> -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/bitops.h> @@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - dquot_free_block(inode, count); - - fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -195,7 +191,6 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, struct ufs_cg_private_info * ucpi; struct 
ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; - int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - ret = dquot_alloc_block(inode, count); - if (ret) { - *err = ret; - return 0; - } fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; - int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -667,7 +655,6 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -679,11 +666,6 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - ret = dquot_alloc_block(inode, count); - if (ret) { - *err = ret; - return 0; - } for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; - int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -752,11 +733,6 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - ret = dquot_alloc_block(inode, uspi->s_fpb); - if (ret) { - *err = ret; - return INVBLOCK; - } fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree--; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 317a0d4..ec78475 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -666,6 +666,6 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = simple_fsync, + .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index a8962ce..33afa20 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,7 +24,6 @@ */ #include <linux/fs.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .open = dquot_file_open, - .fsync = simple_fsync, + .open = generic_file_open, + .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3a959d5..594480e 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -27,7 +27,6 @@ #include <linux/time.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include <linux/bitops.h> @@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - dquot_free_inode(inode); - dquot_drop(inode); - clear_inode (inode); if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) @@ -347,21 +343,12 @@ cg_found: unlock_super (sb); - dquot_initialize(inode); - err = dquot_alloc_inode(inode); - if (err) { 
- dquot_drop(inode); - goto fail_without_unlock; - } - UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: unlock_super(sb); -fail_without_unlock: - inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index cffa756..73fe773 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -37,7 +37,6 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode) { loff_t old_i_size; - if (!is_bad_inode(inode)) - dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index eabc02e..b056f02 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -30,7 +30,6 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/smp_lock.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, UFSD("BEGIN\n"); - dquot_initialize(dir); - inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; - dquot_initialize(dir); - inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - dquot_initialize(dir); - lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } - dquot_initialize(dir); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; - dquot_initialize(dir); - lock_kernel(); inode_inc_link_count(dir); @@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; - dquot_initialize(dir); - de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; - dquot_initialize(old_dir); - dquot_initialize(new_dir); - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 14743d9..3ec5a9e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -77,7 +77,6 @@ #include <linux/errno.h> #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/stat.h> @@ -918,6 +917,7 @@ again: sbi->s_bytesex = BYTESEX_LE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: + case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: @@ -927,6 +927,7 @@ again: sbi->s_bytesex = BYTESEX_BE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: + case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: @@ -1045,7 +1046,7 @@ magic_found: */ sb->s_op = &ufs_super_ops; sb->s_export_op = &ufs_export_ops; - sb->dq_op = NULL; /***/ + 
sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); @@ -1435,126 +1436,19 @@ static void destroy_inodecache(void) kmem_cache_destroy(ufs_inode_cachep); } -static void ufs_clear_inode(struct inode *inode) -{ - dquot_drop(inode); -} - -#ifdef CONFIG_QUOTA -static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); -static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); -#endif - static const struct super_operations ufs_super_ops = { .alloc_inode = ufs_alloc_inode, .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .delete_inode = ufs_delete_inode, - .clear_inode = ufs_clear_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, .show_options = ufs_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ufs_quota_read, - .quota_write = ufs_quota_write, -#endif }; -#ifdef CONFIG_QUOTA - -/* Read data from quotafile - avoid pagecache and such because we cannot afford - * acquiring the locks... As quota files are never truncated and quota code - * itself serializes the operations (and noone else should touch the files) - * we don't have to be afraid of races */ -static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> sb->s_blocksize_bits; - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t toread; - struct buffer_head *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off+len > i_size) - len = i_size-off; - toread = len; - while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; - - bh = ufs_bread(inode, blk, 0, &err); - if (err) - return err; - if (!bh) /* A hole? */ - memset(data, 0, tocopy); - else { - memcpy(data, bh->b_data+offset, tocopy); - brelse(bh); - } - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; -} - -/* Write to quotafile */ -static ssize_t ufs_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - sector_t blk = off >> sb->s_blocksize_bits; - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t towrite = len; - struct buffer_head *bh; - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
- sb->s_blocksize - offset : towrite; - - bh = ufs_bread(inode, blk, 1, &err); - if (!bh) - goto out; - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - brelse(bh); - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; - } -out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); - return err; - } - if (inode->i_size < off+len-towrite) - i_size_write(inode, off+len-towrite); - inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); - return len - towrite; -} - -#endif - static int ufs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index f294c44..589e01a 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -44,7 +44,6 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> -#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -501,12 +500,10 @@ out: return err; } - /* - * We don't define our `inode->i_op->truncate', and call it here, - * because of: - * - there is no way to know old size - * - there is no way inform user about error, if it happens in `truncate' + * TODO: + * - truncate case should use proper ordering instead of using + * simple_setsize */ int ufs_setattr(struct dentry *dentry, struct iattr *attr) { @@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); - - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - error = dquot_transfer(inode, attr); - if (error) - return error; - } if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { loff_t old_i_size = inode->i_size; - error = vmtruncate(inode, attr->ia_size); + error = simple_setsize(inode, attr->ia_size); if (error) return error; error = ufs_truncate(inode, old_i_size); diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 6943ec6..8aba544 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16; #define UFS_SECTOR_SIZE 512 #define UFS_SECTOR_BITS 9 #define UFS_MAGIC 0x00011954 +#define UFS_MAGIC_BW 0x0f242697 #define UFS2_MAGIC 0x19540119 #define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b4769e4..c8fb13f 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ + xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ xfs_mru_cache.o \ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index f01de3c..649ade8 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -37,6 +37,7 @@ #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -850,6 +851,12 @@ xfs_buf_lock_value( * Note that this in no way locks the underlying pages, so it is only * useful for synchronizing concurrent use of buffer objects, not for * synchronizing independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. 
Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( @@ -857,6 +864,8 @@ { trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index d8fb1b5..257a56b 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -100,10 +100,10 @@ xfs_iozero( STATIC int xfs_file_fsync( struct file *file, - struct dentry *dentry, int datasync) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); struct xfs_trans *tp; int error = 0; int log_flushed = 0; @@ -140,8 +140,8 @@ xfs_file_fsync( * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ - if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || - ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the @@ -868,7 +868,7 @@ write_retry: mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); - error2 = -xfs_file_fsync(file, file->f_path.dentry, + error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index e31bf21..9ac8aea 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -19,6 +19,7 @@ #include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f24dbe5..f2d1718 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ /* * Table driven mount option parser. @@ -374,6 +376,13 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { mp->m_flags |= XFS_MOUNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + cmn_err(CE_WARN, + "Enabling EXPERIMENTAL delayed logging feature " + "- use at your own risk.\n"); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -535,6 +544,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1755,7 +1765,7 @@ xfs_init_zones(void) * but it is much faster.
*/ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 8a319cf..ff6bc79 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap, ); +#define XFS_BUSY_SYNC \ + { 0, "async" }, \ + { 1, "sync" } + TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int slot), - TP_ARGS(mp, agno, agbno, len, slot), + TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int sync), + TP_ARGS(trans, agno, agbno, len, sync), TP_STRUCT__entry( __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(int, tid) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, slot) + __field(int, sync) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->tid = trans->t_ticket->t_tid; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->slot = slot; + __entry->sync = sync; ), - TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->tid, __entry->agno, __entry->agbno, __entry->len, - __entry->slot) + __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) ); -#define XFS_BUSY_STATES \ - { 0, "found" }, \ - { 1, "missing" } - TRACE_EVENT(xfs_alloc_unbusy, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int slot, int found), - TP_ARGS(mp, agno, slot, found), + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, slot) - __field(int, found) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->slot = slot; - __entry->found = found; + __entry->agbno = agbno; + __entry->len = len; ), - TP_printk("dev %d:%d agno %u slot %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->slot, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->agbno, + __entry->len) ); +#define XFS_BUSY_STATES \ + { 0, "missing" }, \ + { 1, "found" } + TRACE_EVENT(xfs_alloc_busysearch, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, xfs_lsn_t lsn), - TP_ARGS(mp, agno, agbno, len, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int found), + TP_ARGS(mp, agno, agbno, len, found), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_lsn_t, lsn) + __field(int, found) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->lsn = lsn; + __entry->found = found; ), - TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + TP_printk("dev %d:%d agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, + __print_symbolic(__entry->found, 
XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, __entry->lsn) ); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index b89ec5d..585e763 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk( for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index abb8222..401f364 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,14 +175,20 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + xlog_tid_t tid; /* transaction that created this */ +}; /* * Per-ag incore structure, copies of information in agf and agi, @@ -216,7 +222,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ @@ -226,7 +233,6 @@ typedef struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 94cddbf..a7fbe8a 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, 
args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1693,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2201,7 +2212,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; - memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2479,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. 
This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extent free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing. When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log. Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid in the free-alloc-free in separate
+ * transactions is most easily described with a timeline:
+ *
+ * Thread 1 Thread 2 Thread 3 xfslogd
+ * xact alloc
+ * free X
+ * mark busy
+ * commit xact
+ * free xact
+ * xact alloc
+ * alloc X
+ * busy search
+ * mark xact sync
+ * commit xact
+ * free xact
+ * force log
+ * checkpoint starts
+ * ....
+ * xact alloc
+ * free X
+ * mark busy
+ * finds match
+ * *** KABOOM! ***
+ * ....
+ * log IO completes
+ * unbusy X
+ * checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID for identifying transactions, as delayed logging results in
+ * the busy extent and transaction lifecycles being different; i.e. the busy
+ * extent is active for a lot longer than the transaction. Hence the
+ * transaction structure can be freed and reallocated and then used to mark
+ * the same extent busy again in the new transaction. In this case the new
+ * transaction will have a different tid but can have the same address, and
+ * hence we need to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
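The rule the timeline motivates reduces to a small decision: if the new busy range overlaps an existing one carrying the same transaction ID, the ranges can be merged and the transaction made synchronous; an overlap from a different transaction ID leaves no option but to force the log and retry. A minimal userspace sketch of that decision, using made-up structure and function names rather than the kernel's:

#include <stdio.h>

/* Simplified busy-extent record: a hypothetical stand-in for xfs_busy_extent. */
struct busy {
	unsigned int	bno;	/* start block */
	unsigned int	len;	/* length in blocks */
	unsigned int	tid;	/* transaction id that freed it */
};

enum insert_action { INSERT_NEW, MERGE_SYNC, FORCE_LOG_RETRY };

/*
 * Decide what to do when inserting "new" against an existing busy extent,
 * following the free-alloc-free rules described in the comment above.
 */
static enum insert_action busy_insert_action(const struct busy *old,
					     const struct busy *new)
{
	int overlap = new->bno < old->bno + old->len &&
		      old->bno < new->bno + new->len;

	if (!overlap)
		return INSERT_NEW;	/* no conflict, just insert */
	if (old->tid == new->tid)
		return MERGE_SYNC;	/* same transaction: merge, commit synchronously */
	return FORCE_LOG_RETRY;		/* different transaction: force the log, retry */
}

int main(void)
{
	struct busy old = { .bno = 100, .len = 8, .tid = 1 };
	struct busy new = { .bno = 104, .len = 4, .tid = 2 };

	printf("action = %d\n", busy_insert_action(&old, &new));
	return 0;
}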
*/ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_perag_busy_t *bsy; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; struct xfs_perag *pag; - int n; + struct rb_node **rbp; + struct rb_node *parent; + int match; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - /* search pagb_list for an open slot */ - for (bsy = pag->pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); + xfs_trans_set_sync(tp); + return; } - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &pag->pagb_list[n]; - pag->pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; + } + if (match > 0) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. 
*/ - xfs_trans_set_sync(tp); - } + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); + kmem_free(busyp); } -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { struct xfs_perag *pag; - xfs_perag_busy_t *list; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - pag = xfs_perag_get(tp->t_mountp, agno); + pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); - list = pag->pagb_list; - trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - pag->pagb_count--; + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } } - spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); + return match; } - -/* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. - */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) { struct xfs_perag *pag; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - cnt = pag->pagb_count; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. 
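The new search deliberately returns a three-way result so callers can tell a partial overlap from an exact hit. A self-contained sketch of the same walk over a plain binary search tree, keeping the 0 / -1 / 1 convention (the node layout and names are illustrative, not the kernel rbtree):

#include <stdio.h>

/* Hypothetical stand-in for the rbtree-backed busy extent record. */
struct node {
	unsigned int	bno;
	unsigned int	len;
	struct node	*left, *right;
};

/*
 * Mirror of the search convention above: 0 means no overlap, -1 an
 * overlapping but inexact match, 1 an exact match.
 */
static int busy_search(const struct node *root, unsigned int bno, unsigned int len)
{
	int match = 0;

	while (root) {
		if (bno < root->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > root->bno)
				match = -1;
			root = root->left;
		} else if (bno > root->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < root->bno + root->len)
				match = -1;
			root = root->right;
		} else {
			/* start block matches, length decides exactness */
			match = (root->len == len) ? 1 : -1;
			break;
		}
	}
	return match;
}

int main(void)
{
	struct node b = { .bno = 200, .len = 16 };
	struct node a = { .bno = 100, .len = 8, .right = &b };

	printf("exact:   %d\n", busy_search(&a, 200, 16));	/* 1 */
	printf("partial: %d\n", busy_search(&a, 204, 4));	/* -1 */
	printf("none:    %d\n", busy_search(&a, 300, 4));	/* 0 */
	return 0;
}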
- */ - uend = bno + len - 1; - for (cnt = 0; cnt < pag->pagb_count; cnt++) { - bsy = &pag->pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); + kmem_free(busyp); } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa..6d05199 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xfs_mount; struct xfs_perag; struct xfs_trans; +struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void -xfs_alloc_mark_busy(xfs_trans_t *tp, +xfs_alloc_busy_insert(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b726e10..83f4942 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -134,7 +134,7 @@ xfs_allocbt_free_block( * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 240340a..02a8098 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -64,7 +64,7 @@ xfs_buf_item_log_debug( nbytes = last - first + 1; bfset(bip->bli_logged, first, nbytes); for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; + chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); wordp = &(bip->bli_format.blf_data_map[word_num]); @@ -166,7 +166,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); return 1; } @@ -197,9 +197,9 @@ xfs_buf_item_size( } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; nvecs++; } else { @@ -254,6 +254,20 @@ xfs_buf_item_format( vecp++; nvecs = 1; + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. 
We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(&bip->bli_item))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -261,7 +275,7 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); bip->bli_format.blf_size = nvecs; return; } @@ -294,28 +308,28 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + - XFS_BLI_CHUNK)) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary @@ -341,10 +355,15 @@ xfs_buf_item_format( } /* - * This is called to pin the buffer associated with the buf log - * item in memory so it cannot be written out. Simply call bpin() - * on the buffer to do this. + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. Simply call bpin() on the buffer to do this. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. */ + STATIC void xfs_buf_item_pin( xfs_buf_log_item_t *bip) @@ -356,6 +375,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); + atomic_inc(&bip->bli_refcount); trace_xfs_buf_item_pin(bip); xfs_bpin(bp); } @@ -393,7 +413,7 @@ xfs_buf_item_unpin( ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); /* @@ -489,20 +509,23 @@ xfs_buf_item_trylock( } /* - * Release the buffer associated with the buf log item. 
- * If there is no dirty logged data associated with the - * buffer recorded in the buf log item, then free the - * buf log item and remove the reference to it in the - * buffer. + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. * - * This call ignores the recursion count. It is only called - * when the buffer should REALLY be unlocked, regardless - * of the recursion count. + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. * - * If the XFS_BLI_HOLD flag is set in the buf log item, then - * free the log item if necessary but do not unlock the buffer. - * This is for support of xfs_trans_bhold(). Make sure the - * XFS_BLI_HOLD field is cleared if we don't free the item. + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. */ STATIC void xfs_buf_item_unlock( @@ -514,73 +537,54 @@ xfs_buf_item_unlock( bp = bip->bli_buf; - /* - * Clear the buffer's association with this transaction. - */ + /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); /* - * If this is a transaction abort, don't return early. - * Instead, allow the brelse to happen. - * Normally it would be done for stale (cancelled) buffers - * at unpin time, but we'll never go through the pin/unpin - * cycle if we abort inside commit. + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. */ aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; /* - * If the buf item is marked stale, then don't do anything. - * We'll unlock the buffer and free the buf item when the - * buffer is unpinned for the last time. + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. */ - if (bip->bli_flags & XFS_BLI_STALE) { - bip->bli_flags &= ~XFS_BLI_LOGGED; - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - if (!aborted) - return; - } + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); /* - * Drop the transaction's reference to the log item if - * it was not logged as part of the transaction. Otherwise - * we'll drop the reference in xfs_buf_item_unpin() when - * the transaction is really through with the buffer. + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. 
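The reference-counting convention these comments describe can be modelled in a few lines: the transaction holds one reference, pinning takes another, and unlock and unpin each drop one, so the item survives until log I/O completion. A toy userspace model, with invented names and a plain integer standing in for the atomic reference count:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a buf log item with a reference count. */
struct item {
	int	refcount;
	int	pinned;
};

static void item_put(struct item *ip)
{
	if (--ip->refcount == 0) {
		printf("item freed\n");
		free(ip);
	}
}

/* Transaction joins the item: it holds one reference. */
static struct item *item_get_for_trans(void)
{
	struct item *ip = calloc(1, sizeof(*ip));

	ip->refcount = 1;
	return ip;
}

/* Pin takes an extra reference so unpin can drop it later. */
static void item_pin(struct item *ip)
{
	ip->refcount++;
	ip->pinned = 1;
}

/* Unlock at transaction completion always drops the transaction's reference. */
static void item_unlock(struct item *ip)
{
	item_put(ip);
}

/* Unpin at log I/O completion drops the pin reference. */
static void item_unpin(struct item *ip)
{
	ip->pinned = 0;
	item_put(ip);
}

int main(void)
{
	struct item *ip = item_get_for_trans();

	item_pin(ip);		/* logged in the transaction */
	item_unlock(ip);	/* transaction completes */
	item_unpin(ip);		/* log I/O completes, last reference: freed */
	return 0;
}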
*/ - if (!(bip->bli_flags & XFS_BLI_LOGGED)) { - atomic_dec(&bip->bli_refcount); - } else { - /* - * Clear the logged flag since this is per - * transaction state. - */ - bip->bli_flags &= ~XFS_BLI_LOGGED; + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } } - /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. - */ - hold = bip->bli_flags & XFS_BLI_HOLD; trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it. - * Otherwise, if XFS_BLI_HOLD is set clear it. + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. */ if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) { + bip->bli_format.blf_map_size)) xfs_buf_item_relse(bp); - } else if (hold) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } + else + atomic_dec(&bip->bli_refcount); - /* - * Release the buffer if XFS_BLI_HOLD was not set. - */ - if (!hold) { + if (!hold) xfs_buf_relse(bp); - } } /* @@ -717,12 +721,12 @@ xfs_buf_item_init( } /* - * chunks is the number of XFS_BLI_CHUNK size pieces + * chunks is the number of XFS_BLF_CHUNK size pieces * the buffer can be divided into. Make sure not to * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, @@ -790,8 +794,8 @@ xfs_buf_item_log( /* * Convert byte offsets to bit numbers. */ - first_bit = first >> XFS_BLI_SHIFT; - last_bit = last >> XFS_BLI_SHIFT; + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index df44545..f20bb47 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format { * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLI_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF 0x1 /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLI_CANCEL 0x2 +#define XFS_BLF_CANCEL 0x2 /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. 
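The dirty-region tracking in the buf log item rounds byte ranges out to fixed-size chunks and keeps one bit per chunk in a word-aligned bitmap. A standalone sketch of that arithmetic, using the same 128-byte/shift-7 values as the XFS_BLF_CHUNK and XFS_BLF_SHIFT definitions in this header (the variable names and sample offsets are made up):

#include <stdio.h>

#define BLF_CHUNK		128	/* bytes per logged chunk */
#define BLF_SHIFT		7	/* log2(BLF_CHUNK) */
#define NBWORD			(8 * sizeof(unsigned int))
#define BIT_TO_WORD_SHIFT	5

int main(void)
{
	unsigned int buf_len = 4096;	/* buffer size in bytes */
	unsigned int first = 200;	/* first byte logged */
	unsigned int last = 900;	/* last byte logged */

	/* round the buffer up to whole chunks, then size the bitmap in words */
	unsigned int chunks = (buf_len + (BLF_CHUNK - 1)) >> BLF_SHIFT;
	unsigned int map_size = (chunks + NBWORD) >> BIT_TO_WORD_SHIFT;

	/* convert byte offsets into chunk bit numbers */
	unsigned int first_bit = first >> BLF_SHIFT;
	unsigned int last_bit = last >> BLF_SHIFT;

	printf("chunks=%u map_size=%u words\n", chunks, map_size);
	printf("bits %u..%u set for bytes %u..%u\n", first_bit, last_bit, first, last);
	return 0;
}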
*/ -#define XFS_BLI_UDQUOT_BUF 0x4 -#define XFS_BLI_PDQUOT_BUF 0x8 -#define XFS_BLI_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 -#define XFS_BLI_CHUNK 128 -#define XFS_BLI_SHIFT 7 +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) @@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format { #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format { { XFS_BLI_STALE, "STALE" }, \ { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ - { XFS_BLI_STALE_INODE, "STALE_INODE" } + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ef96175..047b8a8 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) va_list ap; #ifdef DEBUG - xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); #endif if (xfs_panic_mask && (xfs_panic_mask & panic_tag) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3038dd5..5215abc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, uint flags); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); @@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); - -/* local ticket functions */ -STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int count, - char clientid, - uint flags); - #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); STATIC void xlog_verify_grant_head(xlog_t *log, int equals); @@ -360,6 +349,15 @@ xfs_log_reserve( ASSERT(flags & XFS_LOG_PERM_RESERV); internal_ticket = *ticket; + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; + trace_xfs_log_reserve(log, internal_ticket); xlog_grant_push_ail(mp, internal_ticket->t_unit_res); @@ -367,7 +365,8 @@ xfs_log_reserve( } else { /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags); + client, flags, + KM_SLEEP|KM_MAYFAIL); if (!internal_ticket) return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; @@ -452,6 +451,13 @@ xfs_log_mount( /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. 
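Earlier in this hunk, xfs_log_reserve bumps the ticket's tid whenever a permanent reservation is re-used, so each transaction in a rolling chain shows up in the log with its own ID. A toy illustration of that idea, with an invented ticket structure rather than the real xlog_ticket:

#include <stdio.h>

/* Illustrative stand-in for a log ticket with a permanent reservation. */
struct ticket {
	unsigned int	tid;		/* transaction id written to the log */
	int		curr_res;	/* current reservation in bytes */
	int		unit_res;	/* per-transaction reservation unit */
};

/* Re-grant the ticket for the next transaction in a rolling chain. */
static void ticket_regrant(struct ticket *tic)
{
	tic->tid++;			/* new transaction, new id in the log */
	tic->curr_res = tic->unit_res;	/* reservation is topped back up */
}

int main(void)
{
	struct ticket tic = { .tid = 42, .curr_res = 0, .unit_res = 2048 };

	ticket_regrant(&tic);
	printf("tid=%u curr_res=%d\n", tic.tid, tic.curr_res);
	return 0;
}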
+ */ + xlog_cil_init_post_recovery(mp->m_log); + return 0; out_destroy_ail: @@ -658,6 +664,10 @@ xfs_log_item_init( item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); } /* @@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp, *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; return log; out_free_iclog: @@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log) xlog_in_core_t *iclog, *next_iclog; int i; + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { sv_destroy(&iclog->ic_force_wait); @@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -STATIC void -xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); @@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } + + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } /* @@ -1865,7 +1886,7 @@ xlog_write_copy_finish( * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ -STATIC int +int xlog_write( struct log *log, struct xfs_log_vec *log_vector, @@ -1889,22 +1910,26 @@ xlog_write( *start_lsn = 0; len = xlog_write_calc_vec_length(ticket, log_vector); - if (ticket->t_curr_res < len) { - xlog_print_tic_res(log->l_mp, ticket); -#ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); -#else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, - "xfs_log_write: reservation ran out. Need to up reservation"); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); -#endif - } + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; - ticket->t_curr_res -= len; + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); index = 0; lv = log_vector; @@ -3000,6 +3025,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); + xlog_cil_push(log, 1); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3149,6 +3176,12 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); + if (log->l_cilp) { + lsn = xlog_cil_push_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + } + try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3313,22 +3346,30 @@ xfs_log_ticket_get( return ticket; } +xlog_tid_t +xfs_log_get_trans_ident( + struct xfs_trans *tp) +{ + return tp->t_ticket->t_tid; +} + /* * Allocate and initialise a new log ticket. 
*/ -STATIC xlog_ticket_t * +xlog_ticket_t * xlog_ticket_alloc( struct log *log, int unit_bytes, int cnt, char client, - uint xflags) + uint xflags, + int alloc_flags) { struct xlog_ticket *tic; uint num_headers; int iclog_space; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); if (!tic) return NULL; @@ -3647,6 +3688,11 @@ xlog_state_ioerror( * c. nothing new gets queued up after (a) and (b) are done. * d. if !logerror, flush the iclogs to disk, then seal them off * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. */ int xfs_log_force_umount( @@ -3680,6 +3726,16 @@ xfs_log_force_umount( return 1; } retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_push(log, 1); + /* * We must hold both the GRANT lock and the LOG lock, * before we mark the filesystem SHUTDOWN and wake diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 229d1f3..04c78e6 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -19,7 +19,6 @@ #define __XFS_LOG_H__ /* get lsn fields */ - #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) @@ -114,6 +113,9 @@ struct xfs_log_vec { struct xfs_log_vec *lv_next; /* next lv in build list */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_buf_len; /* size of formatted buffer */ }; /* @@ -134,6 +136,7 @@ struct xlog_in_core; struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; +struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, @@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); -struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); +struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); +xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); + +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, int flags); +bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); + #endif diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 0000000..bb17cc0 --- /dev/null +++ b/fs/xfs/xfs_log_cil.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_alloc.h" + +/* + * Perform initial CIL structure initialisation. If the CIL is not + * enabled in this filesystem, ensure the log->l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. 
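The checkpoint ticket allocated here deliberately starts with a zero current reservation so that space can be stolen from transactions as they commit, as the comments above describe. A simplified model of that stealing with plain integers (all names invented; the extra iclog header top-up the real code also performs is omitted):

#include <stdio.h>
#include <assert.h>

/* Illustrative reservation counters for one CIL checkpoint context. */
struct cil_ctx {
	int	unit_res;	/* fixed checkpoint overhead */
	int	curr_res;	/* 0 until the first commit donates unit_res */
	int	space_used;	/* formatted bytes aggregated so far */
};

/*
 * Account "len" bytes of formatted log item data against the committing
 * transaction's reservation, stealing the checkpoint overhead on the
 * first commit into this checkpoint.
 */
static void cil_account(struct cil_ctx *ctx, int *trans_res, int len)
{
	if (ctx->curr_res == 0) {
		/* first commit in this checkpoint: steal the header overhead */
		assert(*trans_res >= ctx->unit_res + len);
		ctx->curr_res = ctx->unit_res;
		*trans_res -= ctx->unit_res;
	}
	*trans_res -= len;
	ctx->space_used += len;
}

int main(void)
{
	struct cil_ctx ctx = { .unit_res = 512, .curr_res = 0, .space_used = 0 };
	int trans_res = 4096;

	cil_account(&ctx, &trans_res, 300);	/* steals 512, then pays 300 */
	cil_account(&ctx, &trans_res, 100);	/* only pays 100 */
	printf("trans_res=%d space_used=%d\n", trans_res, ctx.space_used);
	return 0;
}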
+ */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Insert the log item into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + * + * If this is the first time the item is being placed into the CIL in this + * context, pin it so it can't be written to disk until the CIL is flushed to + * the iclog and the iclog written to disk. + */ +static void +xlog_cil_insert( + struct log *log, + struct xlog_ticket *ticket, + struct xfs_log_item *item, + struct xfs_log_vec *lv) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *old = lv->lv_item->li_lv; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + int len; + int diff_iovecs; + int iclog_space; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + len = lv->lv_buf_len - old->lv_buf_len; + diff_iovecs = lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&item->li_cil)); + + len = lv->lv_buf_len; + diff_iovecs = lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + len += diff_iovecs * sizeof(xlog_op_header_t); + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + spin_lock(&cil->xc_cil_lock); + list_move_tail(&item->li_cil, &cil->xc_cil); + ctx->nvecs += diff_iovecs; + + /* + * If this is the first time the item is being committed to the CIL, + * store the sequence number on the log item so we can tell + * in future commits whether this is the first checkpoint the item is + * being committed into. + */ + if (!item->li_seq) + item->li_seq = ctx->sequence; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? 
 */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+/*
+ * Format log item into a flat buffer
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item and the item is inserted into
+ * the Committed Item List for tracking until the next checkpoint is written
+ * out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer. Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundaries without needing a change in the format of
+ * the item/region encapsulation.
+ *
+ * Hence what we need to do now is rewrite the vector array to point to the
+ * copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn)
+{
+	struct xfs_log_vec *lv;
+
+	if (start_lsn)
+		*start_lsn = log->l_cilp->xc_ctx->sequence;
+
+	ASSERT(log_vector);
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		void	*ptr;
+		int	index;
+		int	len = 0;
+
+		/* build the vector array and calculate its length */
+		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+		for (index = 0; index < lv->lv_niovecs; index++)
+			len += lv->lv_iovecp[index].i_len;
+
+		lv->lv_buf_len = len;
+		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		ptr = lv->lv_buf;
+
+		for (index = 0; index < lv->lv_niovecs; index++) {
+			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+			memcpy(ptr, vec->i_addr, vec->i_len);
+			vec->i_addr = ptr;
+			ptr += vec->i_len;
+		}
+		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+
+		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+	}
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction.
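The formatting step above can be illustrated outside the kernel: copy each region an item wants to log into one flat allocation and re-point the vector entries at the copies, so the item can later be written out without touching the original object. A minimal sketch, assuming a plain pointer/length vector rather than xfs_log_iovec:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Minimal stand-in for a log iovec: a pointer and a length. */
struct region {
	void	*addr;
	size_t	len;
};

/*
 * Copy all regions into one flat buffer and re-point the vector at the
 * copies. Returns the buffer (caller frees) or NULL on allocation failure.
 */
static char *flatten(struct region *vec, int nvecs, size_t *buf_len)
{
	size_t len = 0;
	char *buf, *ptr;
	int i;

	for (i = 0; i < nvecs; i++)
		len += vec[i].len;

	buf = malloc(len);
	if (!buf)
		return NULL;

	ptr = buf;
	for (i = 0; i < nvecs; i++) {
		memcpy(ptr, vec[i].addr, vec[i].len);
		vec[i].addr = ptr;	/* vector now points at the copy */
		ptr += vec[i].len;
	}
	*buf_len = len;
	return buf;
}

int main(void)
{
	char a[] = "header", b[] = "payload";
	struct region vec[] = {
		{ a, sizeof(a) },
		{ b, sizeof(b) },
	};
	size_t len;
	char *buf = flatten(vec, 2, &len);

	printf("flattened %zu bytes, region 1 = \"%s\"\n", len, (char *)vec[1].addr);
	free(buf);
	return 0;
}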
Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_log_vec *lv; + int abortflag = abort ? XFS_LI_ABORTED : 0; + struct xfs_busy_extent *busyp, *n; + + /* unpin all the log items */ + for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { + xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, + abortflag); + } + + list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If the push_now flag is not set, + * then it is a background flush and so we can chose to ignore it. 
+ */ +int +xlog_cil_push( + struct log *log, + int push_now) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* lock out transaction commit, but don't block on background push */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_now) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). 
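The ordering requirement described above is essentially a barrier: a checkpoint may write its commit record only once every lower sequence on the committing list has published a commit LSN. A userspace sketch of that rule using pthreads in place of the kernel's sleep/wakeup primitives (compile with -pthread; all names are invented):

#include <pthread.h>
#include <stdio.h>

#define NCTX 3

/* Illustrative committing table: sequence -> commit "LSN" (0 = not yet). */
static long commit_lsn[NCTX];
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t commit_wait = PTHREAD_COND_INITIALIZER;

/* Wait until every sequence lower than "seq" has a commit LSN recorded. */
static void wait_for_prior_commits(int seq)
{
	pthread_mutex_lock(&lock);
restart:
	for (int i = 0; i < seq; i++) {
		if (commit_lsn[i] == 0) {
			pthread_cond_wait(&commit_wait, &lock);
			goto restart;	/* re-scan from the start after waking */
		}
	}
	pthread_mutex_unlock(&lock);
}

/* Record our commit LSN and wake anyone waiting on it. */
static void publish_commit(int seq, long lsn)
{
	pthread_mutex_lock(&lock);
	commit_lsn[seq] = lsn;
	pthread_cond_broadcast(&commit_wait);
	pthread_mutex_unlock(&lock);
}

static void *pusher(void *arg)
{
	int seq = (int)(long)arg;

	wait_for_prior_commits(seq);
	printf("sequence %d writes its commit record\n", seq);
	publish_commit(seq, 1000 + seq);
	return NULL;
}

int main(void)
{
	pthread_t t[NCTX];

	/* start the pushers in reverse order: records still come out 0, 1, 2 */
	for (int i = NCTX - 1; i >= 0; i--)
		pthread_create(&t[i], NULL, pusher, (void *)(long)i);
	for (int i = 0; i < NCTX; i++)
		pthread_join(t[i], NULL);
	return 0;
}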
+ * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (error || commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + sv_broadcast(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_push_lsn( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + +restart: + down_write(&cil->xc_ctx_lock); + ASSERT(push_seq <= cil->xc_ctx->sequence); + + /* check to see if we need to force out the current context */ + if (push_seq == cil->xc_ctx->sequence) { + up_write(&cil->xc_ctx_lock); + xlog_cil_push(log, 1); + goto restart; + } + + /* + * See if we can find a previous sequence still committing. 
+ * We can drop the flush lock as soon as we have the cil lock + * because we are now only comparing contexts protected by + * the cil lock. + * + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > push_seq) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + if (ctx->sequence != push_seq) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9cf6951..8c07261 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ -typedef __uint32_t xlog_tid_t; - #ifdef __KERNEL__ /* @@ -379,6 +377,99 @@ typedef struct xlog_in_core { } xlog_in_core_t; /* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. 
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+ struct log *xc_log;
+ struct list_head xc_cil;
+ spinlock_t xc_cil_lock;
+ struct xfs_cil_ctx *xc_ctx;
+ struct rw_semaphore xc_ctx_lock;
+ struct list_head xc_committing;
+ sv_t xc_commit_wait;
+};
+
+/*
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure we can get a reservation for the
+ * log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but that it is not too
+ * large for the log and does not induce too much latency when writing out
+ * through the iclogs. We track both space consumed and the number of vectors
+ * in the checkpoint context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers. The number of headers will vary based on the number
+ * of io vectors, so limiting on a specific number of vectors is going to
+ * result in transactions of varying size. IOWs, it is more consistent to
+ * track and limit space consumed in the log rather than by the number of
+ * objects being logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for the
+ * checkpoint size so long as they don't violate any other size rules. Hence
+ * the initial maximum size for the checkpoint transaction will be set to a
+ * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
+ * right now based on the latency of writing out a large amount of data through
+ * the circular iclog buffers.
+ */ + +#define XLOG_CIL_SPACE_LIMIT(log) \ + (min((log->l_logsize >> 2), (8 * 1024 * 1024))) + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -388,6 +479,7 @@ typedef struct log { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ @@ -438,14 +530,17 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) - /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern kmem_zone_t *xfs_log_ticket_zone; +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + static inline void xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) @@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) *off += bytes; } +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +int xlog_cil_push(struct log *log, int push_now); +xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); + /* * Unmount record type is used as a pseudo transaction type for the ticket. * It's value must be outside the range of XFS_TRANS_* values. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0de08e3..14a69ae 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); list_move(&item->ri_list, &trans->r_itemq); @@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1( /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) { + if (!(flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); return; } @@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1( * Check to see whether the buffer being recovered has a corresponding * entry in the buffer cancel record table. If it does then return 1 * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement * the refcount on the entry in the table and remove it from the table * if this is the last reference. * @@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled( * There is nothing in the table built in pass one, * so this buffer must not be cancelled. 
*/ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled( * There is no corresponding entry in the table built * in pass one, so this buffer has not been cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled( * one in the table and remove it if this is the * last reference. */ - if (flags & XFS_BLI_CANCEL) { + if (flags & XFS_BLF_CANCEL) { bcp->bc_refcount--; if (bcp->bc_refcount == 0) { if (prevp == NULL) { @@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled( * We didn't find a corresponding entry in the table, so * return 0 so that the buffer is NOT cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer( } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); /* @@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); @@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer( } memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ + (uint)bit << XFS_BLF_SHIFT), /* dest */ item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + nbits<<XFS_BLF_SHIFT); /* length */ next: i++; bit += nbits; @@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. 
* This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans( if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. + * with the XFS_BLF_CANCEL bit set. */ xlog_recover_do_buffer_pass1(log, buf_f); return 0; @@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans( mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLI_INODE_BUF)) + if (!(flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); @@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans( } error = 0; - if (flags & XFS_BLI_INODE_BUF) { + if (flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index 75d7492..1c55ccb 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -28,7 +28,7 @@ #define XLOG_RHASH(tid) \ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) -#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9ff48a16..1d2c7ee 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -268,6 +268,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index be578ec..ce558ef 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -44,6 +44,7 @@ #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" +#include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; @@ -243,9 +244,8 @@ _xfs_trans_alloc( tp->t_type = type; tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(tp->t_items)); - XFS_LBC_INIT(&(tp->t_busy)); + INIT_LIST_HEAD(&tp->t_busy); return tp; } @@ -255,8 +255,13 @@ _xfs_trans_alloc( */ STATIC void xfs_trans_free( - xfs_trans_t *tp) + struct xfs_trans *tp) { + struct xfs_busy_extent *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tp->t_busy, list) + xfs_alloc_busy_clear(tp->t_mountp, busyp); + atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); @@ -285,9 +290,8 @@ xfs_trans_dup( ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; - ntp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(ntp->t_items)); - XFS_LBC_INIT(&(ntp->t_busy)); + INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -423,7 +427,6 @@ undo_blocks: return error; } - /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. 
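The xfs_trans.c hunks above replace the old fixed-size xfs_log_busy_chunk bookkeeping with a plain per-transaction list: _xfs_trans_alloc() now does INIT_LIST_HEAD(&tp->t_busy), and xfs_trans_free() walks that list with list_for_each_entry_safe(), clearing each busy extent before the transaction itself is freed. Below is a minimal userspace sketch of that iterate-and-clear pattern; the types and helpers (busy_extent, busy_clear, trans_free_busy) are invented stand-ins for illustration, not the kernel's list_head API.

/* Userspace model of the cleanup the new xfs_trans_free() performs over
 * tp->t_busy; names and layout here are illustrative only. */
#include <stdio.h>
#include <stdlib.h>

struct busy_extent {
	unsigned int agno;		/* allocation group holding the extent */
	unsigned int bno, len;		/* extent start block and length */
	struct busy_extent *next;	/* next entry on the transaction's list */
};

/* Stand-in for xfs_alloc_busy_clear(): drop busy tracking for one extent. */
static void busy_clear(struct busy_extent *busyp)
{
	printf("clearing busy extent: ag %u, bno %u, len %u\n",
	       busyp->agno, busyp->bno, busyp->len);
	free(busyp);
}

/* Cache the next pointer before visiting an entry so the current entry
 * can be freed mid-walk, which is the point of list_for_each_entry_safe(). */
static void trans_free_busy(struct busy_extent *head)
{
	struct busy_extent *busyp = head, *n;

	while (busyp) {
		n = busyp->next;
		busy_clear(busyp);
		busyp = n;
	}
}

int main(void)
{
	struct busy_extent *head = NULL;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		struct busy_extent *b = calloc(1, sizeof(*b));
		b->agno = 0;
		b->bno = 100 + 8 * i;
		b->len = 8;
		b->next = head;
		head = b;
	}
	trans_free_busy(head);
	return 0;
}

The _safe flavour of the walk matters because each entry may be removed or freed while it is being visited; an ordinary forward iteration would dereference freed memory on the next step.
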
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas( * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. */ -STATIC void +void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { @@ -880,7 +883,7 @@ xfs_trans_fill_vecs( * they could be immediately flushed and we'd have to race with the flusher * trying to pull the item from the AIL as we add it. */ -static void +void xfs_trans_item_committed( struct xfs_log_item *lip, xfs_lsn_t commit_lsn, @@ -930,26 +933,6 @@ xfs_trans_item_committed( IOP_UNPIN(lip); } -/* Clear all the per-AG busy list items listed in this transaction */ -static void -xfs_trans_clear_busy_extents( - struct xfs_trans *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; - - for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) { - i = 0; - for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (XFS_LBC_ISFREE(lbcp, i)) - continue; - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx); - } - } - xfs_trans_free_busy(tp); -} - /* * This is typically called by the LM when a transaction has been fully * committed to disk. It needs to unpin the items which have @@ -984,7 +967,6 @@ xfs_trans_committed( kmem_free(licp); } - xfs_trans_clear_busy_extents(tp); xfs_trans_free(tp); } @@ -1012,8 +994,7 @@ xfs_trans_uncommit( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } @@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog( *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + if (nvec > XFS_TRANS_LOGVEC_COUNT) kmem_free(log_vector); @@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog( return xfs_log_release_iclog(mp, commit_iclog); } +/* + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. + */ +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( + xfs_trans_t *tp) +{ + xfs_log_item_desc_t *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; + + lidp = xfs_trans_first_item(tp); + + /* Bail out if we didn't find a log item. */ + if (!lidp) { + ASSERT(0); + return NULL; + } + + while (lidp != NULL) { + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + /* Skip items that do not have any vectors for writing */ + lidp->lid_size = IOP_SIZE(lidp->lid_item); + if (!lidp->lid_size) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. 
*/ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; + lidp = xfs_trans_next_item(tp, lidp); + } + + return ret_lv; +} + +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xfs_log_vec *log_vector; + int error; + + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; + + error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + if (error) + return error; + + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* xfs_trans_free_items() unlocks them first */ + xfs_trans_free_items(tp, *commit_lsn, 0); + xfs_trans_free(tp); + return 0; +} /* * xfs_trans_commit @@ -1221,7 +1291,11 @@ _xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + if (error == ENOMEM) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); error = XFS_ERROR(EIO); @@ -1259,8 +1333,7 @@ out_unreserve: error = XFS_ERROR(EIO); } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); @@ -1338,8 +1411,7 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c62beee..8c69e78 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -106,7 +106,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_TYPE_MAX 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -148,6 +149,7 @@ typedef struct xfs_trans_header { { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ { XFS_TRANS_DUMMY1, "DUMMY1" }, \ { XFS_TRANS_DUMMY2, "DUMMY2" }, \ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } @@ -813,6 +815,7 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; +struct xfs_busy_extent; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -828,6 +831,11 @@ typedef struct xfs_log_item { /* buffer item iodone */ /* callback func */ struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 @@ -872,34 +880,6 @@ typedef struct xfs_item_ops { #define XFS_ITEM_PUSHBUF 3 /* - * This structure is used to 
maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; - -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) - -/* * This is the type of function which can be given to xfs_trans_callback() * to be called upon the transaction's commit to disk. */ @@ -950,8 +930,7 @@ typedef struct xfs_trans { unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -1025,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *, void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); extern kmem_zone_t *xfs_trans_zone; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 9cd8090..63d81a2 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -114,7 +114,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* @@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); @@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); 
bip->bli_flags &= ~XFS_BLI_HOLD; @@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); @@ -762,8 +762,8 @@ xfs_trans_binval( ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(lidp->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -774,7 +774,7 @@ xfs_trans_binval( * in the buf log item. The STALE flag will be used in * xfs_buf_item_unpin() to determine if it should clean up * when the last reference to the buf item is given up. - * We set the XFS_BLI_CANCEL flag in the buf log format structure + * We set the XFS_BLF_CANCEL flag in the buf log format structure * and log the buf item. This will be used at recovery time * to determine that copies of the buffer in the log before * this should not be replayed. @@ -792,9 +792,9 @@ xfs_trans_binval( XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLI_CANCEL; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); lidp->lid_flags |= XFS_LID_DIRTY; @@ -802,16 +802,16 @@ xfs_trans_binval( } /* - * This call is used to indicate that the buffer contains on-disk - * inodes which must be handled specially during recovery. They - * require special handling because only the di_next_unlinked from - * the inodes in the buffer should be recovered. The rest of the - * data in the buffer is logged via the inodes themselves. + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. * - * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log - * format structure so that we'll know what to do at recovery time. + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. 
*/ -/* ARGSUSED */ void xfs_trans_inode_buf( xfs_trans_t *tp, @@ -826,7 +826,7 @@ xfs_trans_inode_buf( bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; + bip->bli_flags |= XFS_BLI_INODE_BUF; } /* @@ -908,9 +908,9 @@ xfs_trans_dquot_buf( ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(type == XFS_BLI_UDQUOT_BUF || - type == XFS_BLI_PDQUOT_BUF || - type == XFS_BLI_GDQUOT_BUF); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index eb3fc57..f11d37d 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) void xfs_trans_free_items( xfs_trans_t *tp, + xfs_lsn_t commit_lsn, int flags) { xfs_log_item_chunk_t *licp; @@ -311,7 +312,7 @@ xfs_trans_free_items( * Special case the embedded chunk so we don't free it below. */ if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); xfs_lic_all_free(licp); licp->lic_unused = 0; } @@ -322,7 +323,7 @@ xfs_trans_free_items( */ while (licp != NULL) { ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); next_licp = licp->lic_next; kmem_free(licp); licp = next_licp; @@ -438,112 +439,3 @@ xfs_trans_unlock_chunk( return freed; } - - -/* - * This is called to add the given busy item to the transaction's - * list of busy items. It must find a free busy item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to busy descriptor used to point - * to the new busy entry. The log busy entry will now point to its new - * descriptor with its ???? field. - */ -xfs_log_busy_slot_t * -xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_busy_free == 0) { - lbcp = (xfs_log_busy_chunk_t*) - kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); - ASSERT(lbcp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LBC_INIT(lbcp); - XFS_LBC_CLAIM(lbcp, 0); - lbcp->lbc_unused = 1; - lbsp = XFS_LBC_SLOT(lbcp, 0); - - /* - * Link in the new chunk and update the free count. - */ - lbcp->lbc_next = tp->t_busy.lbc_next; - tp->t_busy.lbc_next = lbcp; - tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. 
- */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - if (XFS_LBC_VACANCY(lbcp)) { - if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { - i = lbcp->lbc_unused; - break; - } else { - /* out-of-order vacancy */ - cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); - ASSERT(0); - } - } - lbcp = lbcp->lbc_next; - } - ASSERT(lbcp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LBC_CLAIM(lbcp, i); - if (lbcp->lbc_unused <= i) { - lbcp->lbc_unused = i + 1; - } - lbsp = XFS_LBC_SLOT(lbcp, i); - tp->t_busy_free--; - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; -} - - -/* - * xfs_trans_free_busy - * Free all of the busy lists from a transaction - */ -void -xfs_trans_free_busy(xfs_trans_t *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_chunk_t *lbcq; - - lbcp = tp->t_busy.lbc_next; - while (lbcp != NULL) { - lbcq = lbcp->lbc_next; - kmem_free(lbcp); - lbcp = lbcq; - } - - XFS_LBC_INIT(&tp->t_busy); - tp->t_busy.lbc_unused = 0; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 73e2ad3..c6e4f2c8 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, struct xfs_log_item_desc *); -void xfs_trans_free_items(struct xfs_trans *, int); -void xfs_trans_unlock_items(struct xfs_trans *, - xfs_lsn_t); -void xfs_trans_free_busy(xfs_trans_t *tp); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); + +void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); + +void xfs_trans_item_committed(struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, int aborted); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); /* * AIL traversal cursor. diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b099045..3207752 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ +typedef __uint32_t xlog_tid_t; /* transaction ID type */ + /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: |
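
Tying the pieces above together: xlog_cil_push() skips a background push (push_now == 0) while the context's space_used is still below XLOG_CIL_SPACE_LIMIT(), which the new xfs_log_priv.h code defines as a quarter of the log capped at 8MB. A small userspace sketch of that sizing rule follows; only the macro body mirrors the patch, while the struct log stand-in and the cil_push_would_run() helper are invented here for illustration.

/* Illustration of the background-push threshold added in xfs_log_priv.h. */
#include <stdio.h>

struct log {
	int l_logsize;		/* log size in bytes */
};

#define MIN(a, b)	((a) < (b) ? (a) : (b))

/* Quarter of the log, capped at 8MB -- same shape as the patch's macro. */
#define XLOG_CIL_SPACE_LIMIT(log) \
	(MIN(((log)->l_logsize >> 2), (8 * 1024 * 1024)))

/* A background (push_now == 0) push is skipped while the aggregated
 * checkpoint is still below the limit; a forced push always proceeds. */
static int cil_push_would_run(struct log *log, int space_used, int push_now)
{
	if (push_now)
		return 1;
	return space_used >= XLOG_CIL_SPACE_LIMIT(log);
}

int main(void)
{
	struct log small = { .l_logsize = 16 * 1024 * 1024 };	/* 16MB log */
	struct log big   = { .l_logsize = 128 * 1024 * 1024 };	/* 128MB log */

	printf("16MB log limit: %d bytes\n", XLOG_CIL_SPACE_LIMIT(&small));	/* 4MB */
	printf("128MB log limit: %d bytes\n", XLOG_CIL_SPACE_LIMIT(&big));	/* 8MB cap */
	printf("background push at 1MB used on 16MB log: %d\n",
	       cil_push_would_run(&small, 1 * 1024 * 1024, 0));		/* 0: skipped */
	printf("forced push at 1MB used on 16MB log: %d\n",
	       cil_push_would_run(&small, 1 * 1024 * 1024, 1));		/* 1: runs */
	return 0;
}

With the 8MB cap, even a very large log keeps individual checkpoints small enough to stream through the circular iclog buffers without the latency the xfs_log_priv.h comment warns about, while the quarter-of-the-log term stops a single checkpoint from overrunning a small log.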