diff options
author | Frederic Weisbecker <fweisbec@gmail.com> | 2010-08-09 01:49:58 +0200 |
---|---|---|
committer | Frederic Weisbecker <fweisbec@gmail.com> | 2010-08-09 02:14:15 +0200 |
commit | d9a145fb6e5f37b9903dea8371ab5c3e34e8e2d1 (patch) | |
tree | e2b4bb46fa00f0ad20447e40dba6fb21a4ae0815 /fs | |
parent | c9243f5bdd6637b2bb7dc254b54d9edf957ef17e (diff) | |
parent | 45d7f32c7a43cbb9592886d38190e379e2eb2226 (diff) | |
download | op-kernel-dev-d9a145fb6e5f37b9903dea8371ab5c3e34e8e2d1.zip op-kernel-dev-d9a145fb6e5f37b9903dea8371ab5c3e34e8e2d1.tar.gz |
Merge commit 'linus/master' into bkl/core
Merge reason: The staging tree has introduced the easycap
driver lately. We need the latest updates to pushdown the
bkl in its ioctl helper.
Diffstat (limited to 'fs')
320 files changed, 9330 insertions, 7912 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile index 1a940ec..91fba02 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_9P_FS) := 9p.o vfs_dir.o \ vfs_dentry.o \ v9fs.o \ - fid.o + fid.o \ + xattr.o \ + xattr_user.o 9p-$(CONFIG_9P_FSCACHE) += cache.o diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 7317b39..3585636 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -97,6 +97,34 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any) return ret; } +/* + * We need to hold v9ses->rename_sem as long as we hold references + * to returned path array. Array element contain pointers to + * dentry names. + */ +static int build_path_from_dentry(struct v9fs_session_info *v9ses, + struct dentry *dentry, char ***names) +{ + int n = 0, i; + char **wnames; + struct dentry *ds; + + for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) + n++; + + wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); + if (!wnames) + goto err_out; + + for (ds = dentry, i = (n-1); i >= 0; i--, ds = ds->d_parent) + wnames[i] = (char *)ds->d_name.name; + + *names = wnames; + return n; +err_out: + return -ENOMEM; +} + /** * v9fs_fid_lookup - lookup for a fid, try to walk if not found * @dentry: dentry to look for fid in @@ -112,7 +140,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) int i, n, l, clone, any, access; u32 uid; struct p9_fid *fid, *old_fid = NULL; - struct dentry *d, *ds; + struct dentry *ds; struct v9fs_session_info *v9ses; char **wnames, *uname; @@ -139,49 +167,62 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) fid = v9fs_fid_find(dentry, uid, any); if (fid) return fid; - + /* + * we don't have a matching fid. To do a TWALK we need + * parent fid. We need to prevent rename when we want to + * look at the parent. + */ + down_read(&v9ses->rename_sem); ds = dentry->d_parent; fid = v9fs_fid_find(ds, uid, any); - if (!fid) { /* walk from the root */ - n = 0; - for (ds = dentry; !IS_ROOT(ds); ds = ds->d_parent) - n++; + if (fid) { + /* Found the parent fid do a lookup with that */ + fid = p9_client_walk(fid, 1, (char **)&dentry->d_name.name, 1); + goto fid_out; + } + up_read(&v9ses->rename_sem); - fid = v9fs_fid_find(ds, uid, any); - if (!fid) { /* the user is not attached to the fs yet */ - if (access == V9FS_ACCESS_SINGLE) - return ERR_PTR(-EPERM); + /* start from the root and try to do a lookup */ + fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any); + if (!fid) { + /* the user is not attached to the fs yet */ + if (access == V9FS_ACCESS_SINGLE) + return ERR_PTR(-EPERM); - if (v9fs_proto_dotu(v9ses)) + if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) uname = NULL; - else - uname = v9ses->uname; + else + uname = v9ses->uname; - fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, - v9ses->aname); - - if (IS_ERR(fid)) - return fid; - - v9fs_fid_add(ds, fid); - } - } else /* walk from the parent */ - n = 1; + fid = p9_client_attach(v9ses->clnt, NULL, uname, uid, + v9ses->aname); + if (IS_ERR(fid)) + return fid; - if (ds == dentry) + v9fs_fid_add(dentry->d_sb->s_root, fid); + } + /* If we are root ourself just return that */ + if (dentry->d_sb->s_root == dentry) return fid; - - wnames = kmalloc(sizeof(char *) * n, GFP_KERNEL); - if (!wnames) - return ERR_PTR(-ENOMEM); - - for (d = dentry, i = (n-1); i >= 0; i--, d = d->d_parent) - wnames[i] = (char *) d->d_name.name; - + /* + * Do a multipath walk with attached root. + * When walking parent we need to make sure we + * don't have a parallel rename happening + */ + down_read(&v9ses->rename_sem); + n = build_path_from_dentry(v9ses, dentry, &wnames); + if (n < 0) { + fid = ERR_PTR(n); + goto err_out; + } clone = 1; i = 0; while (i < n) { l = min(n - i, P9_MAXWELEM); + /* + * We need to hold rename lock when doing a multipath + * walk to ensure none of the patch component change + */ fid = p9_client_walk(fid, l, &wnames[i], clone); if (IS_ERR(fid)) { if (old_fid) { @@ -193,15 +234,17 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) p9_client_clunk(old_fid); } kfree(wnames); - return fid; + goto err_out; } old_fid = fid; i += l; clone = 0; } - kfree(wnames); +fid_out: v9fs_fid_add(dentry, fid); +err_out: + up_read(&v9ses->rename_sem); return fid; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index f8b86e9..38dc0e0 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -237,6 +237,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, __putname(v9ses->uname); return ERR_PTR(-ENOMEM); } + init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); if (rc) { @@ -278,7 +279,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ - if (!v9fs_proto_dotu(v9ses) && + if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index bec4d0b..4c963c9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -104,6 +104,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct backing_dev_info bdi; + struct rw_semaphore rename_sem; }; struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 32ef400..f47c6bb 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -55,6 +55,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode); void v9fs_clear_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index d61e3b2..16c8a2a 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -87,29 +87,19 @@ static void p9stat_init(struct p9_wstat *stbuf) } /** - * v9fs_dir_readdir - read a directory + * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure - * @dirent: directory structure ??? - * @filldir: function to populate directory structure ??? + * @buflen: Length in bytes of buffer to allocate * */ -static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - int over; - struct p9_wstat st; - int err = 0; - struct p9_fid *fid; - int buflen; - int reclen = 0; struct p9_rdir *rdir; + struct p9_fid *fid; + int err = 0; - P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); fid = filp->private_data; - - buflen = fid->clnt->msize - P9_IOHDRSZ; - - /* allocate rdir on demand */ if (!fid->rdir) { rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); @@ -128,6 +118,36 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_unlock(&filp->f_dentry->d_lock); kfree(rdir); } +exit: + return err; +} + +/** + * v9fs_dir_readdir - read a directory + * @filp: opened file structure + * @dirent: directory structure ??? + * @filldir: function to populate directory structure ??? + * + */ + +static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + int over; + struct p9_wstat st; + int err = 0; + struct p9_fid *fid; + int buflen; + int reclen = 0; + struct p9_rdir *rdir; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_IOHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; rdir = (struct p9_rdir *) fid->rdir; err = mutex_lock_interruptible(&rdir->mutex); @@ -146,7 +166,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (rdir->head < rdir->tail) { p9stat_init(&st); err = p9stat_read(rdir->buf + rdir->head, - buflen - rdir->head, &st, + rdir->tail - rdir->head, &st, fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); @@ -176,6 +196,88 @@ exit: return err; } +/** + * v9fs_dir_readdir_dotl - read a directory + * @filp: opened file structure + * @dirent: buffer to fill dirent structures + * @filldir: function to populate dirent structures + * + */ +static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, + filldir_t filldir) +{ + int over; + int err = 0; + struct p9_fid *fid; + int buflen; + struct p9_rdir *rdir; + struct p9_dirent curdirent; + u64 oldoffset = 0; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); + fid = filp->private_data; + + buflen = fid->clnt->msize - P9_READDIRHDRSZ; + + err = v9fs_alloc_rdir_buf(filp, buflen); + if (err) + goto exit; + rdir = (struct p9_rdir *) fid->rdir; + + err = mutex_lock_interruptible(&rdir->mutex); + if (err) + return err; + + while (err == 0) { + if (rdir->tail == rdir->head) { + err = p9_client_readdir(fid, rdir->buf, buflen, + filp->f_pos); + if (err <= 0) + goto unlock_and_exit; + + rdir->head = 0; + rdir->tail = err; + } + + while (rdir->head < rdir->tail) { + + err = p9dirent_read(rdir->buf + rdir->head, + buflen - rdir->head, &curdirent, + fid->clnt->proto_version); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); + err = -EIO; + goto unlock_and_exit; + } + + /* d_off in dirent structure tracks the offset into + * the next dirent in the dir. However, filldir() + * expects offset into the current dirent. Hence + * while calling filldir send the offset from the + * previous dirent structure. + */ + over = filldir(dirent, curdirent.d_name, + strlen(curdirent.d_name), + oldoffset, v9fs_qid2ino(&curdirent.qid), + curdirent.d_type); + oldoffset = curdirent.d_off; + + if (over) { + err = 0; + goto unlock_and_exit; + } + + filp->f_pos = curdirent.d_off; + rdir->head += err; + } + } + +unlock_and_exit: + mutex_unlock(&rdir->mutex); +exit: + return err; +} + /** * v9fs_dir_release - close a directory @@ -207,7 +309,7 @@ const struct file_operations v9fs_dir_operations = { const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir, + .readdir = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 2bedc6c..e97c92b 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -59,9 +59,13 @@ int v9fs_file_open(struct inode *inode, struct file *file) struct p9_fid *fid; int omode; - P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); + P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); + if (v9fs_proto_dotl(v9ses)) + omode = file->f_flags; + else + omode = v9fs_uflags2omode(file->f_flags, + v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -73,11 +77,12 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (omode & P9_OTRUNC) { + if (file->f_flags & O_TRUNC) { i_size_write(inode, 0); inode->i_blocks = 0; } - if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses))) + if ((file->f_flags & O_APPEND) && + (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); } @@ -139,7 +144,7 @@ ssize_t v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, u64 offset) { - int n, total; + int n, total, size; struct p9_fid *fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, @@ -147,6 +152,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, n = 0; total = 0; + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; do { n = p9_client_read(fid, data, udata, offset, count); if (n <= 0) @@ -160,7 +166,7 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, offset += n; count -= n; total += n; - } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ)); + } while (count > 0 && n == size); if (n < 0) total = n; @@ -183,11 +189,13 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count, { int ret; struct p9_fid *fid; + size_t size; P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); fid = filp->private_data; - if (count > (fid->clnt->msize - P9_IOHDRSZ)) + size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; + if (count > size) ret = v9fs_file_readn(filp, NULL, udata, count, *offset); else ret = p9_client_read(fid, NULL, udata, *offset, count); @@ -224,9 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data, fid = filp->private_data; clnt = fid->clnt; - rsize = fid->iounit; - if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) - rsize = clnt->msize - P9_IOHDRSZ; + rsize = fid->iounit ? fid->iounit : clnt->msize - P9_IOHDRSZ; do { if (count < rsize) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4331b3b..6e94f32 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -35,6 +35,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -42,6 +43,7 @@ #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" +#include "xattr.h" static const struct inode_operations v9fs_dir_inode_operations; static const struct inode_operations v9fs_dir_inode_operations_dotu; @@ -236,6 +238,41 @@ void v9fs_destroy_inode(struct inode *inode) #endif /** + * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * new file system object. This checks the S_ISGID to determine the owning + * group of the new file system object. + */ + +static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) +{ + BUG_ON(dir_inode == NULL); + + if (dir_inode->i_mode & S_ISGID) { + /* set_gid bit is set.*/ + return dir_inode->i_gid; + } + return current_fsgid(); +} + +/** + * v9fs_dentry_from_dir_inode - helper function to get the dentry from + * dir inode. + * + */ + +static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&dcache_lock); + /* Directory should have only one entry. */ + BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); + dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); + spin_unlock(&dcache_lock); + return dentry; +} + +/** * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with @@ -267,7 +304,13 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFBLK: case S_IFCHR: case S_IFSOCK: - if (!v9fs_proto_dotu(v9ses)) { + if (v9fs_proto_dotl(v9ses)) { + inode->i_op = &v9fs_file_inode_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; + } else if (v9fs_proto_dotu(v9ses)) { + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + } else { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); err = -EINVAL; @@ -396,23 +439,14 @@ void v9fs_clear_inode(struct inode *inode) #endif } -/** - * v9fs_inode_from_fid - populate an inode by issuing a attribute request - * @v9ses: session information - * @fid: fid to issue attribute request for - * @sb: superblock on which to create inode - * - */ - static struct inode * -v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, +v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { int err, umode; - struct inode *ret; + struct inode *ret = NULL; struct p9_wstat *st; - ret = NULL; st = p9_client_stat(fid); if (IS_ERR(st)) return ERR_CAST(st); @@ -433,15 +467,62 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, #endif p9stat_free(st); kfree(st); - return ret; - error: p9stat_free(st); kfree(st); return ERR_PTR(err); } +static struct inode * +v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + struct inode *ret = NULL; + int err; + struct p9_stat_dotl *st; + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return ERR_CAST(st); + + ret = v9fs_get_inode(sb, st->st_mode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + goto error; + } + + v9fs_stat2inode_dotl(st, ret); + ret->i_ino = v9fs_qid2ino(&st->qid); +#ifdef CONFIG_9P_FSCACHE + v9fs_vcookie_set_qid(ret, &st->qid); + v9fs_cache_inode_get_cookie(ret); +#endif + kfree(st); + return ret; +error: + kfree(st); + return ERR_PTR(err); +} + +/** + * v9fs_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_dotl(v9ses, fid, sb); + else + return v9fs_inode(v9ses, fid, sb); +} + /** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted @@ -563,6 +644,118 @@ error: } /** + * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @dir: directory inode that is being created + * @dentry: dentry that is being deleted + * @mode: create permissions + * @nd: path information + * + */ + +static int +v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + int err = 0; + char *name = NULL; + gid_t gid; + int flags; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL; + struct p9_fid *dfid, *ofid; + struct file *filp; + struct p9_qid qid; + struct inode *inode; + + v9ses = v9fs_inode2v9ses(dir); + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " + "mode:0x%x\n", name, flags, mode); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + /* clone a fid to use for creation */ + ofid = p9_client_walk(dfid, 0, NULL, 1); + if (IS_ERR(ofid)) { + err = PTR_ERR(ofid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_open_dotl failed in creat %d\n", + err); + goto error; + } + + /* No need to populate the inode if we are not opening the file AND + * not in cached mode. + */ + if (!v9ses->cache && !(nd && nd->flags & LOOKUP_OPEN)) { + /* Not in cached mode. No need to populate inode with stat */ + dentry->d_op = &v9fs_dentry_operations; + p9_client_clunk(ofid); + d_instantiate(dentry, NULL); + return 0; + } + + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + + /* if we are opening a file, assign the open fid to the file */ + if (nd && nd->flags & LOOKUP_OPEN) { + filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + if (IS_ERR(filp)) { + p9_client_clunk(ofid); + return PTR_ERR(filp); + } + filp->private_data = ofid; + } else + p9_client_clunk(ofid); + + return 0; + +error: + if (ofid) + p9_client_clunk(ofid); + if (fid) + p9_client_clunk(fid); + return err; +} + +/** * v9fs_vfs_create - VFS hook to create files * @dir: directory inode that is being created * @dentry: dentry that is being deleted @@ -652,6 +845,83 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return err; } + +/** + * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @dir: inode that is being unlinked + * @dentry: dentry that is being unlinked + * @mode: mode for new directory + * + */ + +static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, + int mode) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + gid_t gid; + char *name; + struct inode *inode; + struct p9_qid qid; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + err = 0; + v9ses = v9fs_inode2v9ses(dir); + + mode |= S_IFDIR; + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + goto error; + } + + name = (char *) dentry->d_name.name; + err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } +error: + if (fid) + p9_client_clunk(fid); + return err; +} + /** * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from @@ -678,6 +948,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); + /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) return ERR_CAST(dfid); @@ -785,27 +1056,33 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto clunk_olddir; } + down_write(&v9ses->rename_sem); if (v9fs_proto_dotl(v9ses)) { retval = p9_client_rename(oldfid, newdirfid, (char *) new_dentry->d_name.name); if (retval != -ENOSYS) goto clunk_newdir; } + if (old_dentry->d_parent != new_dentry->d_parent) { + /* + * 9P .u can only handle file rename in the same directory + */ - /* 9P can only handle file rename in the same directory */ - if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { P9_DPRINTK(P9_DEBUG_ERROR, "old dir and new dir are different\n"); retval = -EXDEV; goto clunk_newdir; } - v9fs_blank_wstat(&wstat); wstat.muid = v9ses->uname; wstat.name = (char *) new_dentry->d_name.name; retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: + if (!retval) + /* successful rename */ + d_move(old_dentry, new_dentry); + up_write(&v9ses->rename_sem); p9_client_clunk(newdirfid); clunk_olddir: @@ -853,6 +1130,42 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int +v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_stat_dotl *st; + + P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); + err = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + return simple_getattr(mnt, dentry, stat); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + /* Ask for all the fields in stat structure. Server will return + * whatever it supports + */ + + st = p9_client_getattr_dotl(fid, P9_STATS_ALL); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, dentry->d_inode); + generic_fillattr(dentry->d_inode, stat); + /* Change block size to what the server returned */ + stat->blksize = st->st_blksize; + + kfree(st); + return 0; +} + /** * v9fs_vfs_setattr - set file metadata * @dentry: file whose metadata to set @@ -903,6 +1216,49 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** + * v9fs_vfs_setattr_dotl - set file metadata + * @dentry: file whose metadata to set + * @iattr: metadata assignment structure + * + */ + +static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +{ + int retval; + struct v9fs_session_info *v9ses; + struct p9_fid *fid; + struct p9_iattr_dotl p9attr; + + P9_DPRINTK(P9_DEBUG_VFS, "\n"); + + retval = inode_change_ok(dentry->d_inode, iattr); + if (retval) + return retval; + + p9attr.valid = iattr->ia_valid; + p9attr.mode = iattr->ia_mode; + p9attr.uid = iattr->ia_uid; + p9attr.gid = iattr->ia_gid; + p9attr.size = iattr->ia_size; + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + + retval = -EPERM; + v9ses = v9fs_inode2v9ses(dentry->d_inode); + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + retval = p9_client_setattr(fid, &p9attr); + if (retval >= 0) + retval = inode_setattr(dentry->d_inode, iattr); + + return retval; +} + +/** * v9fs_stat2inode - populate an inode structure with mistat info * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate @@ -980,6 +1336,77 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, } /** + * v9fs_stat2inode_dotl - populate an inode structure with stat info + * @stat: stat structure + * @inode: inode to populate + * @sb: superblock of filesystem + * + */ + +void +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +{ + + if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode->i_uid = stat->st_uid; + inode->i_gid = stat->st_gid; + inode->i_nlink = stat->st_nlink; + inode->i_mode = stat->st_mode; + inode->i_rdev = new_decode_dev(stat->st_rdev); + + if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, inode->i_rdev); + + i_size_write(inode, stat->st_size); + inode->i_blocks = stat->st_blocks; + } else { + if (stat->st_result_mask & P9_STATS_ATIME) { + inode->i_atime.tv_sec = stat->st_atime_sec; + inode->i_atime.tv_nsec = stat->st_atime_nsec; + } + if (stat->st_result_mask & P9_STATS_MTIME) { + inode->i_mtime.tv_sec = stat->st_mtime_sec; + inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + } + if (stat->st_result_mask & P9_STATS_CTIME) { + inode->i_ctime.tv_sec = stat->st_ctime_sec; + inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + } + if (stat->st_result_mask & P9_STATS_UID) + inode->i_uid = stat->st_uid; + if (stat->st_result_mask & P9_STATS_GID) + inode->i_gid = stat->st_gid; + if (stat->st_result_mask & P9_STATS_NLINK) + inode->i_nlink = stat->st_nlink; + if (stat->st_result_mask & P9_STATS_MODE) { + inode->i_mode = stat->st_mode; + if ((S_ISBLK(inode->i_mode)) || + (S_ISCHR(inode->i_mode))) + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + } + if (stat->st_result_mask & P9_STATS_RDEV) + inode->i_rdev = new_decode_dev(stat->st_rdev); + if (stat->st_result_mask & P9_STATS_SIZE) + i_size_write(inode, stat->st_size); + if (stat->st_result_mask & P9_STATS_BLOCKS) + inode->i_blocks = stat->st_blocks; + } + if (stat->st_result_mask & P9_STATS_GEN) + inode->i_generation = stat->st_gen; + + /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION + * because the inode structure does not have fields for them. + */ +} + +/** * v9fs_qid2ino - convert qid into inode number * @qid: qid to hash * @@ -1022,7 +1449,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_proto_dotu(v9ses)) + if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1128,6 +1555,99 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, } /** + * v9fs_vfs_symlink_dotl - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See Also: 9P2000.L RFC for more information + * + */ + +static int +v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct v9fs_session_info *v9ses; + struct p9_fid *dfid; + struct p9_fid *fid = NULL; + struct inode *inode; + struct p9_qid qid; + char *name; + int err; + gid_t gid; + + name = (char *) dentry->d_name.name; + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", + dir->i_ino, name, symname); + v9ses = v9fs_inode2v9ses(dir); + + dfid = v9fs_fid_lookup(dentry->d_parent); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + return err; + } + + gid = v9fs_get_fsgid_for_create(dir); + + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_egid failed %d\n", gid); + goto error; + } + + /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ + err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); + goto error; + } + + if (v9ses->cache) { + /* Now walk from the parent so we can get an unopened fid. */ + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + /* instantiate inode and assign the unopened fid to dentry */ + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* Not in cached mode. No need to populate inode with stat */ + inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + + return err; +} + +/** * v9fs_vfs_symlink - helper function to create symlinks * @dir: directory inode containing symlink * @dentry: dentry for symlink @@ -1186,6 +1706,76 @@ clunk_fid: } /** + * v9fs_vfs_link_dotl - create a hardlink for dotl + * @old_dentry: dentry for file to link to + * @dir: inode destination for new link + * @dentry: dentry for link + * + */ + +static int +v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int err; + struct p9_fid *dfid, *oldfid; + char *name; + struct v9fs_session_info *v9ses; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", + dir->i_ino, old_dentry->d_name.name, + dentry->d_name.name); + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) + return PTR_ERR(dfid); + + oldfid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(oldfid)) + return PTR_ERR(oldfid); + + name = (char *) dentry->d_name.name; + + err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); + + if (err < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); + return err; + } + + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + /* Get the latest stat info from server. */ + struct p9_fid *fid; + struct p9_stat_dotl *st; + + fid = v9fs_fid_lookup(old_dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) + return PTR_ERR(st); + + v9fs_stat2inode_dotl(st, old_dentry->d_inode); + + kfree(st); + } else { + /* Caching disabled. No need to get upto date stat info. + * This dentry will be released immediately. So, just i_count++ + */ + atomic_inc(&old_dentry->d_inode->i_count); + } + + dentry->d_op = old_dentry->d_op; + d_instantiate(dentry, old_dentry->d_inode); + + return err; +} + +/** * v9fs_vfs_mknod - create a special file * @dir: inode destination for new link * @dentry: dentry for file @@ -1230,6 +1820,100 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return retval; } +/** + * v9fs_vfs_mknod_dotl - create a special file + * @dir: inode destination for new link + * @dentry: dentry for file + * @mode: mode for creation + * @rdev: device associated with special file + * + */ +static int +v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + int err; + char *name; + struct v9fs_session_info *v9ses; + struct p9_fid *fid = NULL, *dfid = NULL; + struct inode *inode; + gid_t gid; + struct p9_qid qid; + struct dentry *dir_dentry; + + P9_DPRINTK(P9_DEBUG_VFS, + " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, + dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + + if (!new_valid_dev(rdev)) + return -EINVAL; + + v9ses = v9fs_inode2v9ses(dir); + dir_dentry = v9fs_dentry_from_dir_inode(dir); + dfid = v9fs_fid_lookup(dir_dentry); + if (IS_ERR(dfid)) { + err = PTR_ERR(dfid); + P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); + dfid = NULL; + goto error; + } + + gid = v9fs_get_fsgid_for_create(dir); + if (gid < 0) { + P9_DPRINTK(P9_DEBUG_VFS, "v9fs_get_fsgid_for_create failed\n"); + goto error; + } + + name = (char *) dentry->d_name.name; + + err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); + if (err < 0) + goto error; + + /* instantiate inode and assign the unopened fid to the dentry */ + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + fid = p9_client_walk(dfid, 1, &name, 1); + if (IS_ERR(fid)) { + err = PTR_ERR(fid); + P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", + err); + fid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", + err); + goto error; + } + dentry->d_op = &v9fs_cached_dentry_operations; + d_instantiate(dentry, inode); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + fid = NULL; + } else { + /* + * Not in cached mode. No need to populate inode with stat. + * socket syscall returns a fd, so we need instantiate + */ + inode = v9fs_get_inode(dir->i_sb, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto error; + } + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + } + +error: + if (fid) + p9_client_clunk(fid); + return err; +} + static const struct inode_operations v9fs_dir_inode_operations_dotu = { .create = v9fs_vfs_create, .lookup = v9fs_vfs_lookup, @@ -1238,24 +1922,29 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod, + .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; static const struct inode_operations v9fs_dir_inode_operations_dotl = { - .create = v9fs_vfs_create, + .create = v9fs_vfs_create_dotl, .lookup = v9fs_vfs_lookup, - .symlink = v9fs_vfs_symlink, - .link = v9fs_vfs_link, + .link = v9fs_vfs_link_dotl, + .symlink = v9fs_vfs_symlink_dotl, .unlink = v9fs_vfs_unlink, - .mkdir = v9fs_vfs_mkdir, + .mkdir = v9fs_vfs_mkdir_dotl, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod, + .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, + }; static const struct inode_operations v9fs_dir_inode_operations = { @@ -1276,8 +1965,12 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_file_inode_operations_dotl = { - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, }; static const struct inode_operations v9fs_symlink_inode_operations = { @@ -1292,6 +1985,10 @@ static const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, - .getattr = v9fs_vfs_getattr, - .setattr = v9fs_vfs_setattr, + .getattr = v9fs_vfs_getattr_dotl, + .setattr = v9fs_vfs_setattr_dotl, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = v9fs_listxattr, }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index be74d02..4b9ede0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -45,6 +45,7 @@ #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" +#include "xattr.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; @@ -77,9 +78,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; - if (v9fs_proto_dotl(v9ses)) + if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; - else + sb->s_xattr = v9fs_xattr_handlers; + } else sb->s_op = &v9fs_super_ops; sb->s_bdi = &v9ses->bdi; @@ -107,7 +109,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; struct p9_fid *fid; int retval = 0; @@ -124,16 +125,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto close_session; } - st = p9_client_stat(fid); - if (IS_ERR(st)) { - retval = PTR_ERR(st); - goto clunk_fid; - } - sb = sget(fs_type, NULL, v9fs_set_super, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); - goto free_stat; + goto clunk_fid; } v9fs_fill_super(sb, v9ses, flags, data); @@ -151,22 +146,38 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, } sb->s_root = root; - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, root->d_inode, sb); + if (v9fs_proto_dotl(v9ses)) { + struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto clunk_fid; + } + + v9fs_stat2inode_dotl(st, root->d_inode); + kfree(st); + } else { + struct p9_wstat *st = NULL; + st = p9_client_stat(fid); + if (IS_ERR(st)) { + retval = PTR_ERR(st); + goto clunk_fid; + } + + root->d_inode->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, root->d_inode, sb); + + p9stat_free(st); + kfree(st); + } v9fs_fid_add(root, fid); - p9stat_free(st); - kfree(st); P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -free_stat: - p9stat_free(st); - kfree(st); - clunk_fid: p9_client_clunk(fid); @@ -176,8 +187,6 @@ close_session: return retval; release_sb: - p9stat_free(st); - kfree(st); deactivate_locked_super(sb); return retval; } @@ -278,4 +287,5 @@ struct file_system_type v9fs_fs_type = { .get_sb = v9fs_get_sb, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, + .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c new file mode 100644 index 0000000..f88e5c2 --- /dev/null +++ b/fs/9p/xattr.c @@ -0,0 +1,160 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> + +#include "fid.h" +#include "xattr.h" + +/* + * v9fs_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size) +{ + ssize_t retval; + int msize, read_count; + u64 offset = 0, attr_size; + struct p9_fid *fid, *attr_fid; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", + __func__, name, buffer_size); + + fid = v9fs_fid_lookup(dentry); + if (IS_ERR(fid)) + return PTR_ERR(fid); + + attr_fid = p9_client_xattrwalk(fid, name, &attr_size); + if (IS_ERR(attr_fid)) { + retval = PTR_ERR(attr_fid); + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_attrwalk failed %zd\n", retval); + attr_fid = NULL; + goto error; + } + if (!buffer_size) { + /* request to get the attr_size */ + retval = attr_size; + goto error; + } + if (attr_size > buffer_size) { + retval = -ERANGE; + goto error; + } + msize = attr_fid->clnt->msize; + while (attr_size) { + if (attr_size > (msize - P9_IOHDRSZ)) + read_count = msize - P9_IOHDRSZ; + else + read_count = attr_size; + read_count = p9_client_read(attr_fid, ((char *)buffer)+offset, + NULL, offset, read_count); + if (read_count < 0) { + /* error in xattr read */ + retval = read_count; + goto error; + } + offset += read_count; + attr_size -= read_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (attr_fid) + p9_client_clunk(attr_fid); + return retval; + +} + +/* + * v9fs_xattr_set() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags) +{ + u64 offset = 0; + int retval, msize, write_count; + struct p9_fid *fid = NULL; + + P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", + __func__, name, value_len, flags); + + fid = v9fs_fid_clone(dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + fid = NULL; + goto error; + } + /* + * On success fid points to xattr + */ + retval = p9_client_xattrcreate(fid, name, value_len, flags); + if (retval < 0) { + P9_DPRINTK(P9_DEBUG_VFS, + "p9_client_xattrcreate failed %d\n", retval); + goto error; + } + msize = fid->clnt->msize;; + while (value_len) { + if (value_len > (msize - P9_IOHDRSZ)) + write_count = msize - P9_IOHDRSZ; + else + write_count = value_len; + write_count = p9_client_write(fid, ((char *)value)+offset, + NULL, offset, write_count); + if (write_count < 0) { + /* error in xattr write */ + retval = write_count; + goto error; + } + offset += write_count; + value_len -= write_count; + } + /* Total read xattr bytes */ + retval = offset; +error: + if (fid) + retval = p9_client_clunk(fid); + return retval; +} + +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); +} + +const struct xattr_handler *v9fs_xattr_handlers[] = { + &v9fs_xattr_user_handler, + NULL +}; diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h new file mode 100644 index 0000000..9ddf672 --- /dev/null +++ b/fs/9p/xattr.h @@ -0,0 +1,27 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ +#ifndef FS_9P_XATTR_H +#define FS_9P_XATTR_H + +#include <linux/xattr.h> + +extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern struct xattr_handler v9fs_xattr_user_handler; + +extern ssize_t v9fs_xattr_get(struct dentry *, const char *, + void *, size_t); +extern int v9fs_xattr_set(struct dentry *, const char *, + const void *, size_t, int); +extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +#endif /* FS_9P_XATTR_H */ diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c new file mode 100644 index 0000000..d0b701b --- /dev/null +++ b/fs/9p/xattr_user.c @@ -0,0 +1,80 @@ +/* + * Copyright IBM Corporation, 2010 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include "xattr.h" + +static int v9fs_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name+prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_get(dentry, full_name, buffer, size); + kfree(full_name); + return retval; +} + +static int v9fs_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + int retval; + char *full_name; + size_t name_len; + size_t prefix_len = XATTR_USER_PREFIX_LEN; + + if (name == NULL) + return -EINVAL; + + if (strcmp(name, "") == 0) + return -EINVAL; + + name_len = strlen(name); + full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL); + if (!full_name) + return -ENOMEM; + memcpy(full_name, XATTR_USER_PREFIX, prefix_len); + memcpy(full_name + prefix_len, name, name_len); + full_name[prefix_len + name_len] = '\0'; + + retval = v9fs_xattr_set(dentry, full_name, value, size, flags); + kfree(full_name); + return retval; +} + +struct xattr_handler v9fs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = v9fs_xattr_user_get, + .set = v9fs_xattr_user_set, +}; @@ -64,7 +64,7 @@ source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" config CUSE - tristate "Character device in Userpace support" + tristate "Character device in Userspace support" depends on FUSE_FS help This FUSE extension allows character devices to be diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 5c4e61d..8f975f2 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -2,6 +2,7 @@ config AFS_FS tristate "Andrew File System support (AFS) (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select AF_RXRPC + select DNS_RESOLVER help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/cell.c b/fs/afs/cell.c index e19c13f..ffea35c 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> +#include <linux/dns_resolver.h> #include <linux/sched.h> #include <keys/rxrpc-type.h> #include "internal.h" @@ -36,6 +37,8 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) struct key *key; size_t namelen; char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; + char *dvllist = NULL, *_vllist = NULL; + char delimiter = ':'; int ret; _enter("%s,%s", name, vllist); @@ -43,8 +46,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ namelen = strlen(name); - if (namelen > AFS_MAXCELLNAME) + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); + } /* allocate and initialise a cell record */ cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); @@ -64,15 +69,31 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) INIT_LIST_HEAD(&cell->vl_list); spin_lock_init(&cell->vl_lock); + /* if the ip address is invalid, try dns query */ + if (!vllist || strlen(vllist) < 7) { + ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); + if (ret < 0) { + _leave(" = %d", ret); + return ERR_PTR(ret); + } + _vllist = dvllist; + + /* change the delimiter for user-space reply */ + delimiter = ','; + + } else { + _vllist = vllist; + } + /* fill in the VL server list from the rest of the string */ do { unsigned a, b, c, d; - next = strchr(vllist, ':'); + next = strchr(_vllist, delimiter); if (next) *next++ = 0; - if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) goto bad_address; if (a > 255 || b > 255 || c > 255 || d > 255) @@ -81,7 +102,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) cell->vl_addrs[cell->vl_naddrs++].s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d); - } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next)); + } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); /* create a key to represent an anonymous user */ memcpy(keyname, "afs@", 4); @@ -110,6 +131,7 @@ bad_address: ret = -EINVAL; error: key_put(cell->anonymous_key); + kfree(dvllist); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -201,14 +223,12 @@ int afs_cell_init(char *rootcell) } cp = strchr(rootcell, ':'); - if (!cp) { - printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); - _leave(" = -EINVAL"); - return -EINVAL; - } + if (!cp) + _debug("kAFS: no VL server IP addresses specified"); + else + *cp++ = 0; /* allocate a cell record for the root cell */ - *cp++ = 0; new_root = afs_cell_create(rootcell, cp); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); diff --git a/fs/afs/main.c b/fs/afs/main.c index 66d54d3..cfd1cbe 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -111,6 +111,8 @@ static int __init afs_init(void) /* initialise the callback update process */ ret = afs_callback_update_init(); + if (ret < 0) + goto error_callback_update_init; /* create the RxRPC transport */ ret = afs_open_socket(); @@ -127,15 +129,16 @@ static int __init afs_init(void) error_fs: afs_close_socket(); error_open_socket: + afs_callback_update_kill(); +error_callback_update_init: + afs_vlocation_purge(); error_vl_update_init: + afs_cell_purge(); error_cell_init: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif - afs_callback_update_kill(); - afs_vlocation_purge(); - afs_cell_purge(); afs_proc_cleanup(); rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); diff --git a/fs/afs/write.c b/fs/afs/write.c index 3dab9e9..722743b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode) { struct address_space *mapping = vnode->vfs_inode.i_mapping; struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_cyclic = 1, @@ -1277,7 +1277,7 @@ out: /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not - * implemented. May fail with -EFAULT if the context pointed to + * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) @@ -1795,15 +1795,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, /* io_getevents: * Attempts to read at least min_nr events and up to nr events from - * the completion queue for the aio_context specified by ctx_id. May - * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, - * if nr is out of range, if when is out of range. May fail with - * -EFAULT if any of the memory specified to is invalid. May return - * 0 or < min_nr if no events are available and the timeout specified - * by when has elapsed, where when == NULL specifies an infinite - * timeout. Note that the timeout pointed to by when is relative and - * will be updated if not NULL and the operation blocks. Will fail - * with -ENOSYS if not implemented. + * the completion queue for the aio_context specified by ctx_id. If + * it succeeds, the number of read events is returned. May fail with + * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is + * out of range, if timeout is out of range. May fail with -EFAULT + * if any of the memory specified is invalid. May return 0 or + * < min_nr if the timeout specified by timeout has elapsed + * before sufficient events are available, where timeout == NULL + * specifies an infinite timeout. Note that the timeout pointed to by + * timeout is relative and will be updated if not NULL and the + * operation blocks. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 34ddda8..dc39d28 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -436,7 +436,7 @@ befs_init_inodecache(void) init_once); if (befs_inode_cachep == NULL) { printk(KERN_ERR "befs_init_inodecache: " - "Couldn't initalize inode slabcache\n"); + "Couldn't initialize inode slabcache\n"); return -ENOMEM; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 99d6af8..b3171fb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -681,8 +681,8 @@ retry: if (!bd_may_claim(bdev, whole, holder)) return -EBUSY; - /* if someone else is claiming, wait for it to finish */ - if (whole->bd_claiming && whole->bd_claiming != holder) { + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0d1d966..c3df14c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, return ret; } +/* + * min slot controls the lowest index we're willing to push to the + * right. We'll push up to and including min_slot, but no lower + */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, - int free_space, u32 left_nritems) + int free_space, u32 left_nritems, + u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *upper = path->nodes[1]; @@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (empty) nr = 0; else - nr = 1; + nr = max_t(u32, 1, min_slot); if (path->slots[0] >= left_nritems) push_space += data_size; @@ -2469,10 +2474,14 @@ out_unlock: * * returns 1 if the push failed because the other node didn't have enough * room, 0 if everything worked out and < 0 if there were major errors. + * + * this will push starting from min_slot to the end of the leaf. It won't + * push any slot lower than min_slot */ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, + int min_data_size, int data_size, + int empty, u32 min_slot) { struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; @@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; - return __push_leaf_right(trans, root, path, data_size, empty, - right, free_space, left_nritems); + return __push_leaf_right(trans, root, path, min_data_size, empty, + right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); free_extent_buffer(right); @@ -2525,12 +2534,17 @@ out_unlock: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the + * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, - int free_space, int right_nritems) + int free_space, u32 right_nritems, + u32 max_slot) { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; @@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, slot = path->slots[1]; if (empty) - nr = right_nritems; + nr = min(right_nritems, max_slot); else - nr = right_nritems - 1; + nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); @@ -2712,10 +2726,14 @@ out: /* * push some data in the path leaf to the left, trying to free up at * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * max_slot can put a limit on how far into the leaf we'll push items. The + * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the + * items */ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) { struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; @@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } /* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. * @@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int wret; int split; int num_doubles = 0; + int tried_avoid_double = 0; l = path->nodes[0]; slot = path->slots[0]; @@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); if (wret < 0) return wret; } @@ -2923,6 +3003,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2; } } @@ -2939,6 +3021,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2 ; } } @@ -3019,6 +3103,13 @@ again: } return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; } static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, @@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, extent_buffer_get(leaf); btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, 1); + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a4080c2..d74e6af 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2594,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { - .bdi = wbc->bdi, .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, @@ -2628,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { - .bdi = inode->i_mapping->backing_dev_info, .sync_mode = mode, .older_than_this = NULL, .nr_to_write = nr_pages * 2, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4dbaf89..9254b3d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, */ /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE)) + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; ret = mnt_want_write(file->f_path.mnt); @@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* determine range to clone */ ret = -EINVAL; - if (off >= src->i_size || off + len > src->i_size) + if (off + len > src->i_size || off + len < off) goto out_unlock; if (len == 0) olen = len = src->i_size - off; @@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; + u64 endoff; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, @@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_release_path(root, path); inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (new_key.offset + datal > inode->i_size) - btrfs_i_size_write(inode, - new_key.offset + datal); + + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > off+olen) + endoff = off+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a7840..42c7faf 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, fscache_object_states[object->fscache.state], - object->fscache.flags, object->fscache.work.flags, + object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -212,7 +212,7 @@ wait_for_old_object: /* if the object we're waiting for is queued for processing, * then just put ourselves on the queue behind it */ - if (slow_work_is_queued(&xobject->fscache.work)) { + if (work_pending(&xobject->fscache.work)) { _debug("queue OBJ%x behind OBJ%x immediately", object->fscache.debug_id, xobject->fscache.debug_id); @@ -220,8 +220,7 @@ wait_for_old_object: } /* otherwise we sleep until either the object we're waiting for - * is done, or the slow-work facility wants the thread back to - * do other work */ + * is done, or the fscache_object is congested */ wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); init_wait(&wait); requeue = false; @@ -229,8 +228,8 @@ wait_for_old_object: prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) break; - requeue = slow_work_sleep_till_thread_needed( - &object->fscache.work, &timeout); + + requeue = fscache_object_sleep_till_congested(&timeout); } while (timeout > 0 && !requeue); finish_wait(wq, &wait); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 0f0d41f..0e3c092 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; pagevec_init(&pagevec, 0); @@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, pagevec_init(&pagevec, 0); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; INIT_LIST_HEAD(&backpages); diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 04b8280..bc87b9c 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -2,7 +2,7 @@ config CEPH_FS tristate "Ceph distributed file system (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select LIBCRC32C - select CONFIG_CRYPTO_AES + select CRYPTO_AES help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 83d4d27..6d44053 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -493,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le32_to_cpu(head->op); + op = le16_to_cpu(head->op); result = le32_to_cpu(head->result); dout("handle_reply op %d result %d\n", op, result); switch (op) { @@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + kfree(ac->private); ac->private = NULL; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 619b616..b81be9a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ - if (!ctx) - return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!ctx) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (cap) { + caps_use_count++; + caps_total_count++; + } + return cap; + } spin_lock(&caps_list_lock); dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", @@ -621,7 +627,7 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); spin_unlock(&inode->i_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1175,7 +1181,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return delayed; } @@ -2147,7 +2153,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) else if (flushsnaps) ceph_flush_snaps(ci); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (put) iput(inode); } @@ -2223,7 +2229,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, iput(inode); } else if (complete_capsnap) { ceph_flush_snaps(ci); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } if (drop_capsnap) iput(inode); @@ -2399,7 +2405,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (queue_invalidate) ceph_queue_invalidate(inode); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, @@ -2454,7 +2460,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up(&mdsc->cap_flushing_wq); + wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { @@ -2466,7 +2472,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); out: spin_unlock(&inode->i_lock); @@ -2886,18 +2892,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; + int used, dirty; int ret = 0; - int used = 0; spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); - dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode, - mds, ceph_cap_string(used), ceph_cap_string(drop), + dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", + inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); - /* only drop unused caps */ - drop &= ~used; + /* only drop unused, clean caps */ + drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { @@ -2977,6 +2984,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); } spin_unlock(&dentry->d_lock); return ret; diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c index 9ba54ef..a4eec13 100644 --- a/fs/ceph/crush/mapper.c +++ b/fs/ceph/crush/mapper.c @@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { - dprintk("choose %d x=%d r=%d\n", in->id, x, r); + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: return bucket_uniform_choose((struct crush_bucket_uniform *)in, @@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) */ static int is_out(struct crush_map *map, __u32 *weight, int item, int x) { - if (weight[item] >= 0x1000) + if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) return 1; @@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map, int itemtype; int collide, reject; const int orig_tries = 5; /* attempts before we fall back to search */ - dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos); + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); for (rep = outpos; rep < numrep; rep++) { /* keep trying until we get a non-out, non-colliding item */ @@ -366,6 +368,7 @@ static int crush_choose(struct crush_map *map, BUG_ON(item >= 0 || (-1-item) >= map->max_buckets); in = map->buckets[-1-item]; + retry_bucket = 1; continue; } @@ -377,15 +380,25 @@ static int crush_choose(struct crush_map *map, } } - if (recurse_to_leaf && - item < 0 && - crush_choose(map, map->buckets[-1-item], - weight, - x, outpos+1, 0, - out2, outpos, - firstn, 0, NULL) <= outpos) { - reject = 1; - } else { + reject = 0; + if (recurse_to_leaf) { + if (item < 0) { + if (crush_choose(map, + map->buckets[-1-item], + weight, + x, outpos+1, 0, + out2, outpos, + firstn, 0, + NULL) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! */ + out2[outpos] = item; + } + } + + if (!reject) { /* out? */ if (itemtype == 0) reject = is_out(map, weight, @@ -424,12 +437,12 @@ reject: continue; } - dprintk("choose got %d\n", item); + dprintk("CHOOSE got %d\n", item); out[outpos] = item; outpos++; } - dprintk("choose returns %d\n", outpos); + dprintk("CHOOSE returns %d\n", outpos); return outpos; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3be33fb..f2f5332 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp) static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = p; + struct ceph_client *client = s->private; int total, avail, used, reserved, min; ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f857193..f94ed3c 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -266,6 +266,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) spin_lock(&inode->i_lock); if ((filp->f_pos == 2 || fi->dentry) && !ceph_test_opt(client, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && (ci->i_ceph_flags & CEPH_I_COMPLETE) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { err = __dcache_readdir(filp, dirent, filldir); @@ -1013,18 +1014,22 @@ out_touch: /* * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen. + * of the current dir gen or if this is in the snapshot namespace. */ static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); struct inode *parent_inode = dentry->d_parent->d_inode; + u64 snapid = ceph_snap(parent_inode); - if (parent_inode) { + dout("dentry_release %p parent %p\n", dentry, parent_inode); + + if (parent_inode && snapid != CEPH_SNAPDIR) { struct ceph_inode_info *ci = ceph_inode(parent_inode); spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen) { + if (ci->i_shared_gen == di->lease_shared_gen || + snapid <= CEPH_MAXSNAP) { dout(" clearing %p complete (d_release)\n", parent_inode); ci->i_ceph_flags &= ~CEPH_I_COMPLETE; @@ -1241,7 +1246,9 @@ struct dentry_operations ceph_dentry_ops = { struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_dentry_release, }; struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_dentry_release, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6251a15..7c08698 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -265,7 +265,7 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ab47f46..389f9db 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, d_drop(dn); realdn = d_materialise_unique(dn, in); if (IS_ERR(realdn)) { - pr_err("splice_dentry error %p inode %p ino %llx.%llx\n", - dn, in, ceph_vinop(in)); + pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); if (prehash) *prehash = false; /* don't rehash on error */ dn = realdn; /* note realdn contains the error */ @@ -1199,8 +1199,10 @@ retry_lookup: goto out; } err = ceph_init_dentry(dn); - if (err < 0) + if (err < 0) { + dput(dn); goto out; + } } else if (dn->d_inode && (ceph_ino(dn->d_inode) != vino.ino || ceph_snap(dn->d_inode) != vino.snap)) { @@ -1234,18 +1236,23 @@ retry_lookup: goto out; } dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) + dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); - dput(dn); - continue; + goto next_item; } - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, req->r_request_started); - dput(dn); + if (dn) + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); +next_item: + if (dn) + dput(dn); } req->r_did_prepopulate = true; @@ -1494,7 +1501,7 @@ retry: if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1766947..dd440bd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -868,7 +868,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&inode->i_lock); ci->i_wanted_max_size = 0; @@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + /* cap releases */ releases = 0; if (req->r_inode_drop) @@ -1561,7 +1564,7 @@ static void complete_request(struct ceph_mds_client *mdsc, if (req->r_callback) req->r_callback(mdsc, req); else - complete(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; @@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; dout(" r_locked_dir = %p\n", req->r_locked_dir); - - if (req->r_target_inode && req->r_got_unsafe) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - else - rhead->ino = 0; return 0; } @@ -1907,7 +1932,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete(&req->r_safe_completion); + complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -1922,7 +1947,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete(&mdsc->safe_umount_waiters); + complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -2101,7 +2126,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete(&mdsc->session_close_waiters); + complete_all(&mdsc->session_close_waiters); kick_requests(mdsc, mds); break; @@ -2783,6 +2808,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) drop_leases(mdsc); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); + + /* + * wait for reply handlers to drop their request refs and + * their inode/dcache refs + */ + ceph_msgr_flush(); } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index b292fa4..952410c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -188,6 +188,7 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 64b8b1f..15167b2 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con); * nicely render a sockaddr as a string. */ #define MAX_ADDR_STR 20 -static char addr_str[MAX_ADDR_STR][40]; +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; @@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss) int i; char *s; struct sockaddr_in *in4 = (void *)ss; - unsigned char *quad = (void *)&in4->sin_addr.s_addr; struct sockaddr_in6 *in6 = (void *)ss; spin_lock(&addr_str_lock); @@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - sprintf(s, "%u.%u.%u.%u:%u", - (unsigned int)quad[0], - (unsigned int)quad[1], - (unsigned int)quad[2], - (unsigned int)quad[3], - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); break; case AF_INET6: - sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", - in6->sin6_addr.s6_addr16[0], - in6->sin6_addr.s6_addr16[1], - in6->sin6_addr.s6_addr16[2], - in6->sin6_addr.s6_addr16[3], - in6->sin6_addr.s6_addr16[4], - in6->sin6_addr.s6_addr16[5], - in6->sin6_addr.s6_addr16[6], - in6->sin6_addr.s6_addr16[7], - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); break; default: @@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock, */ static struct socket *ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; int ret; BUG_ON(con->sock); - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); con->sock = sock; @@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", pr_addr(&con->peer_addr.in_addr), @@ -657,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; + con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end, struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } memset(ss, 0, sizeof(*ss)); if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - ',', &ipend)) { + delim, &ipend)) ss->ss_family = AF_INET; - } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - ',', &ipend)) { + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) ss->ss_family = AF_INET6; - } else { + else goto bad; - } p = ipend; + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + /* port? */ if (p < end && *p == ':') { port = 0; @@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%s'\n", c); + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } @@ -1396,10 +1399,12 @@ static int read_partial_message(struct ceph_connection *con) if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, con->in_hdr.front_len, con->in_hdr.data_len); + skip = 0; con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); + BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -2013,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p\n", con, msg); + dout("con_revoke %p msg %p - was on queue\n", con, msg); list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - } else { - dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + ceph_msg_put(msg); + msg->hdr.seq = 0; } mutex_unlock(&con->mutex); } diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 07a5399..54fe01c 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -345,7 +345,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, out: mutex_unlock(&monc->mutex); - wake_up(&client->auth_wq); + wake_up_all(&client->auth_wq); } /* @@ -462,7 +462,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, } mutex_unlock(&monc->mutex); if (req) { - complete(&req->completion); + complete_all(&req->completion); put_generic_request(req); } return; @@ -718,14 +718,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->m_auth->front_max); if (ret < 0) { monc->client->auth_err = ret; - wake_up(&monc->client->auth_wq); + wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = monc->auth->global_id; + monc->client->msgr->inst.name.num = + cpu_to_le64(monc->auth->global_id); __send_subscribe(monc); __resend_generic_request(monc); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index d25b4ad..e385223 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -862,12 +862,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (req->r_callback) req->r_callback(req, msg); else - complete(&req->r_completion); + complete_all(&req->r_completion); if (flags & CEPH_OSD_FLAG_ONDISK) { if (req->r_safe_callback) req->r_safe_callback(req, msg); - complete(&req->r_safe_completion); /* fsync waiter */ + complete_all(&req->r_safe_completion); /* fsync waiter */ } done: @@ -1083,7 +1083,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); - wake_up(&osdc->client->auth_wq); + wake_up_all(&osdc->client->auth_wq); return; bad: @@ -1344,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) int type = le16_to_cpu(msg->hdr.type); if (!osd) - return; + goto out; osdc = osd->o_osdc; switch (type) { @@ -1359,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } +out: ceph_msg_put(msg); } diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index ddc656f..416d46a 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + kfree(pi); goto bad; } __decode_pool(p, pi); @@ -707,6 +708,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, newcrush = crush_decode(*p, min(*p+len, end)); if (IS_ERR(newcrush)) return ERR_CAST(newcrush); + *p += len; } /* new flags? */ @@ -829,12 +831,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* remove any? */ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, node)->pgid, pgid) <= 0) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } if (pglen) { @@ -850,19 +853,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, for (j = 0; j < pglen; j++) pg->osds[j] = ceph_decode_32(p); err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) + if (err) { + kfree(pg); goto bad; + } dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, pglen); } } while (rbp) { - struct rb_node *cur = rbp; + struct ceph_pg_mapping *cur = + rb_entry(rbp, struct ceph_pg_mapping, node); + rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); + dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); + rb_erase(&cur->node, &map->pg_temp); + kfree(cur); } /* ignore the rest */ diff --git a/fs/char_dev.c b/fs/char_dev.c index d6db933..f80a4f2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -20,6 +20,7 @@ #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> +#include <linux/tty.h> #include "internal.h" diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 80f3525..917b7d4 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS - select SLOW_WORK help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block @@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH If unsure, say N. config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. config CIFS_XATTR bool "CIFS extended attributes" @@ -122,6 +121,7 @@ config CIFS_DEBUG2 config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS && KEYS + select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share @@ -131,6 +131,15 @@ config CIFS_DFS_UPCALL IP addresses) which is needed for implicit mounts of DFS junction points. If unsure, say N. +config CIFS_FSCACHE + bool "Provide CIFS client caching support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 9948c003..adefa60 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -11,3 +11,5 @@ cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o + +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o diff --git a/fs/cifs/README b/fs/cifs/README index a727b7c..a7081ee 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -568,8 +568,9 @@ module can be displayed via modinfo. Misc /proc/fs/cifs Flags and Debug Info ======================================= Informational pseudo-files: -DebugData Displays information about active CIFS sessions - and shares, as well as the cifs.ko version. +DebugData Displays information about active CIFS sessions and + shares, features enabled as well as the cifs.ko + version. Stats Lists summary resource usage information as well as per share statistics, if CONFIG_CIFS_STATS in enabled in the kernel configuration. diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c new file mode 100644 index 0000000..224d7bbd --- /dev/null +++ b/fs/cifs/cache.c @@ -0,0 +1,331 @@ +/* + * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifs_debug.h" + +/* + * CIFS filesystem definition for FS-Cache + */ +struct fscache_netfs cifs_fscache_netfs = { + .name = "cifs", + .version = 0, +}; + +/* + * Register CIFS for caching with FS-Cache + */ +int cifs_fscache_register(void) +{ + return fscache_register_netfs(&cifs_fscache_netfs); +} + +/* + * Unregister CIFS for caching + */ +void cifs_fscache_unregister(void) +{ + fscache_unregister_netfs(&cifs_fscache_netfs); +} + +/* + * Key layout of CIFS server cache index object + */ +struct cifs_server_key { + uint16_t family; /* address family */ + uint16_t port; /* IP port */ + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } addr[0]; +}; + +/* + * Server object keyed by {IPaddress,port,family} tuple + */ +static uint16_t cifs_server_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct TCP_Server_Info *server = cookie_netfs_data; + const struct sockaddr *sa = (struct sockaddr *) &server->addr.sockAddr; + struct cifs_server_key *key = buffer; + uint16_t key_len = sizeof(struct cifs_server_key); + + memset(key, 0, key_len); + + /* + * Should not be a problem as sin_family/sin6_family overlays + * sa_family field + */ + switch (sa->sa_family) { + case AF_INET: + key->family = server->addr.sockAddr.sin_family; + key->port = server->addr.sockAddr.sin_port; + key->addr[0].ipv4_addr = server->addr.sockAddr.sin_addr; + key_len += sizeof(key->addr[0].ipv4_addr); + break; + + case AF_INET6: + key->family = server->addr.sockAddr6.sin6_family; + key->port = server->addr.sockAddr6.sin6_port; + key->addr[0].ipv6_addr = server->addr.sockAddr6.sin6_addr; + key_len += sizeof(key->addr[0].ipv6_addr); + break; + + default: + cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family); + key_len = 0; + break; + } + + return key_len; +} + +/* + * Server object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_server_index_def = { + .name = "CIFS.server", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_server_get_key, +}; + +/* + * Auxiliary data attached to CIFS superblock within the cache + */ +struct cifs_fscache_super_auxdata { + u64 resource_id; /* unique server resource id */ +}; + +static char *extract_sharename(const char *treename) +{ + const char *src; + char *delim, *dst; + int len; + + /* skip double chars at the beginning */ + src = treename + 2; + + /* share name is always preceded by '\\' now */ + delim = strchr(src, '\\'); + if (!delim) + return ERR_PTR(-EINVAL); + delim++; + len = strlen(delim); + + /* caller has to free the memory */ + dst = kstrndup(delim, len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + return dst; +} + +/* + * Superblock object currently keyed by share name + */ +static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + const struct cifsTconInfo *tcon = cookie_netfs_data; + char *sharename; + uint16_t len; + + sharename = extract_sharename(tcon->treeName); + if (IS_ERR(sharename)) { + cFYI(1, "CIFS: couldn't extract sharename\n"); + sharename = NULL; + return 0; + } + + len = strlen(sharename); + if (len > maxbuf) + return 0; + + memcpy(buffer, sharename, len); + + kfree(sharename); + + return len; +} + +static uint16_t +cifs_fscache_super_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifsTconInfo *tcon = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_super_auxdata auxdata; + const struct cifsTconInfo *tcon = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.resource_id = tcon->resource_id; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +/* + * Superblock object for FS-Cache + */ +const struct fscache_cookie_def cifs_fscache_super_index_def = { + .name = "CIFS.super", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = cifs_super_get_key, + .get_aux = cifs_fscache_super_get_aux, + .check_aux = cifs_fscache_super_check_aux, +}; + +/* + * Auxiliary data attached to CIFS inode within the cache + */ +struct cifs_fscache_inode_auxdata { + struct timespec last_write_time; + struct timespec last_change_time; + u64 eof; +}; + +static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + uint16_t keylen; + + /* use the UniqueId as the key */ + keylen = sizeof(cifsi->uniqueid); + if (keylen > maxbuf) + keylen = 0; + else + memcpy(buffer, &cifsi->uniqueid, keylen); + + return keylen; +} + +static void +cifs_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) +{ + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + *size = cifsi->vfs_inode.i_size; +} + +static uint16_t +cifs_fscache_inode_get_aux(const void *cookie_netfs_data, void *buffer, + uint16_t maxbuf) +{ + struct cifs_fscache_inode_auxdata auxdata; + const struct cifsInodeInfo *cifsi = cookie_netfs_data; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (maxbuf > sizeof(auxdata)) + maxbuf = sizeof(auxdata); + + memcpy(buffer, &auxdata, maxbuf); + + return maxbuf; +} + +static enum +fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, + const void *data, + uint16_t datalen) +{ + struct cifs_fscache_inode_auxdata auxdata; + struct cifsInodeInfo *cifsi = cookie_netfs_data; + + if (datalen != sizeof(auxdata)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time = cifsi->vfs_inode.i_mtime; + auxdata.last_change_time = cifsi->vfs_inode.i_ctime; + + if (memcmp(data, &auxdata, datalen) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + return FSCACHE_CHECKAUX_OKAY; +} + +static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) +{ + struct cifsInodeInfo *cifsi = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + cFYI(1, "cifs inode 0x%p now uncached", cifsi); + + for (;;) { + nr_pages = pagevec_lookup(&pvec, + cifsi->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +const struct fscache_cookie_def cifs_fscache_inode_object_def = { + .name = "CIFS.uniqueid", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = cifs_fscache_inode_get_key, + .get_attr = cifs_fscache_inode_get_attr, + .get_aux = cifs_fscache_inode_get_aux, + .check_aux = cifs_fscache_inode_check_aux, + .now_uncached = cifs_fscache_inode_now_uncached, +}; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 4fce6e6..eb1ba49 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Features: "); +#ifdef CONFIG_CIFS_DFS_UPCALL + seq_printf(m, "dfs"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_FSCACHE + seq_printf(m, "fscache"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_WEAK_PW_HASH + seq_printf(m, "lanman"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_POSIX + seq_printf(m, "posix"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_UPCALL + seq_printf(m, "spnego"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_XATTR + seq_printf(m, "xattr"); +#endif + seq_putc(m, '\n'); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "Servers:"); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ac19a6f..d6ced7a 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -141,7 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, } rc = dns_resolve_server_name_to_ip(*devname, &srvIP); - if (rc != 0) { + if (rc < 0) { cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", __func__, *devname, rc); goto compose_mount_options_err; @@ -150,8 +150,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata */ - md_len = strlen(sb_mountdata) + strlen(srvIP) + - strlen(ref->node_name) + 12; + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; mountdata = kzalloc(md_len+1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; @@ -230,28 +229,22 @@ compose_mount_options_err: goto compose_mount_options_out; } - -static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent, - struct dentry *dentry, const struct dfs_info3_param *ref) +/** + * cifs_dfs_do_refmount - mounts specified path using provided refferal + * @cifs_sb: parent/root superblock + * @fullpath: full path in UNC format + * @ref: server's referral + */ +static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb, + const char *fullpath, const struct dfs_info3_param *ref) { - struct cifs_sb_info *cifs_sb; struct vfsmount *mnt; char *mountdata; char *devname = NULL; - char *fullpath; - - cifs_sb = CIFS_SB(dentry->d_inode->i_sb); - /* - * this function gives us a path with a double backslash prefix. We - * require a single backslash for DFS. - */ - fullpath = build_path_from_dentry(dentry); - if (!fullpath) - return ERR_PTR(-ENOMEM); + /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->mountdata, fullpath + 1, ref, &devname); - kfree(fullpath); if (IS_ERR(mountdata)) return (struct vfsmount *)mountdata; @@ -357,8 +350,8 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) rc = -EINVAL; goto out_err; } - mnt = cifs_dfs_do_refmount(nd->path.mnt, - nd->path.dentry, referrals + i); + mnt = cifs_dfs_do_refmount(cifs_sb, + full_path, referrals + i); cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__, referrals[i].node_name, mnt); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 246a167..9e77145 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -35,6 +35,7 @@ #define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ #define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */ #define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync*/ +#define CIFS_MOUNT_FSCACHE 0x8000 /* local caching enabled */ struct cifs_sb_info { struct cifsTconInfo *tcon; /* primary mount */ diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 379bd7d..8704490 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = { /* strlen of ";uid=0x" */ #define UID_KEY_LEN 7 +/* strlen of ";creduid=0x" */ +#define CREDUID_KEY_LEN 11 + /* strlen of ";user=" */ #define USER_KEY_LEN 6 @@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + USER_KEY_LEN + strlen(sesInfo->userName) + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; @@ -144,6 +148,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) sprintf(dp, ";uid=0x%x", sesInfo->linux_uid); dp = description + strlen(description); + sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); + + dp = description + strlen(description); sprintf(dp, ";user=%s", sesInfo->userName); dp = description + strlen(description); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 484e52b..a5ed10c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -45,8 +45,8 @@ #include "cifs_fs_sb.h" #include <linux/mm.h> #include <linux/key-type.h> -#include "dns_resolve.h" #include "cifs_spnego.h" +#include "fscache.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ int cifsFYI = 0; @@ -329,6 +329,12 @@ cifs_destroy_inode(struct inode *inode) } static void +cifs_clear_inode(struct inode *inode) +{ + cifs_fscache_release_inode_cookie(inode); +} + +static void cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) { seq_printf(s, ",addr="); @@ -489,6 +495,7 @@ static const struct super_operations cifs_super_ops = { .alloc_inode = cifs_alloc_inode, .destroy_inode = cifs_destroy_inode, .drop_inode = cifs_drop_inode, + .clear_inode = cifs_clear_inode, /* .delete_inode = cifs_delete_inode, */ /* Do not need above function unless later we add lazy close of inodes or unless the kernel forgets to call us with the same number of releases (closes) @@ -902,6 +909,10 @@ init_cifs(void) cFYI(1, "cifs_max_pending set to max of 256"); } + rc = cifs_fscache_register(); + if (rc) + goto out; + rc = cifs_init_inodecache(); if (rc) goto out_clean_proc; @@ -922,27 +933,13 @@ init_cifs(void) if (rc) goto out_unregister_filesystem; #endif -#ifdef CONFIG_CIFS_DFS_UPCALL - rc = register_key_type(&key_type_dns_resolver); - if (rc) - goto out_unregister_key_type; -#endif - rc = slow_work_register_user(THIS_MODULE); - if (rc) - goto out_unregister_resolver_key; return 0; - out_unregister_resolver_key: -#ifdef CONFIG_CIFS_DFS_UPCALL - unregister_key_type(&key_type_dns_resolver); - out_unregister_key_type: -#endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); out_unregister_filesystem: -#endif unregister_filesystem(&cifs_fs_type); +#endif out_destroy_request_bufs: cifs_destroy_request_bufs(); out_destroy_mids: @@ -951,6 +948,8 @@ init_cifs(void) cifs_destroy_inodecache(); out_clean_proc: cifs_proc_clean(); + cifs_fscache_unregister(); + out: return rc; } @@ -959,9 +958,9 @@ exit_cifs(void) { cFYI(DBG2, "exit_cifs"); cifs_proc_clean(); + cifs_fscache_unregister(); #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); - unregister_key_type(&key_type_dns_resolver); #endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a7eb65c..d82f5fb 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.64" +#define CIFS_VERSION "1.65" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a88479c..0cdfb8c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,10 +16,13 @@ * the GNU Lesser General Public License for more details. * */ +#ifndef _CIFS_GLOB_H +#define _CIFS_GLOB_H + #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> -#include <linux/slow-work.h> +#include <linux/workqueue.h> #include "cifs_fs_sb.h" #include "cifsacl.h" /* @@ -34,7 +37,7 @@ #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null termination then *2 for unicode versions */ -#define MAX_PASSWORD_SIZE 16 +#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ #define CIFS_MIN_RCV_POOL 4 @@ -80,8 +83,7 @@ enum statusEnum { }; enum securityEnum { - PLAINTXT = 0, /* Legacy with Plaintext passwords */ - LANMAN, /* Legacy LANMAN auth */ + LANMAN = 0, /* Legacy LANMAN auth */ NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ @@ -142,7 +144,6 @@ struct TCP_Server_Info { struct list_head pending_mid_q; void *Server_NlsInfo; /* BB - placeholder for future NLS info */ unsigned short server_codepage; /* codepage for the server */ - unsigned long ip_address; /* IP addr for the server if known */ enum protocolEnum protocolType; char versionMajor; char versionMinor; @@ -190,19 +191,9 @@ struct TCP_Server_Info { bool sec_mskerberos; /* supports legacy MS Kerberos */ bool sec_kerberosu2u; /* supports U2U Kerberos */ bool sec_ntlmssp; /* supports NTLMSSP */ -}; - -/* - * The following is our shortcut to user information. We surface the uid, - * and name. We always get the password on the fly in case it - * has changed. We also hang a list of sessions owned by this user off here. - */ -struct cifsUidInfo { - struct list_head userList; - struct list_head sessionList; /* SMB sessions for this user */ - uid_t linux_uid; - char user[MAX_USERNAME_SIZE + 1]; /* ascii name of user */ - /* BB may need ptr or callback for PAM or WinBind info */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; /* client index cache cookie */ +#endif }; /* @@ -212,9 +203,6 @@ struct cifsSesInfo { struct list_head smb_ses_list; struct list_head tcon_list; struct mutex session_mutex; -#if 0 - struct cifsUidInfo *uidInfo; /* pointer to user info */ -#endif struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum statusEnum status; @@ -226,7 +214,8 @@ struct cifsSesInfo { char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ int Suid; /* remote smb uid */ - uid_t linux_uid; /* local Linux uid */ + uid_t linux_uid; /* overriding owner of files on the mount */ + uid_t cred_uid; /* owner of credentials */ int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ @@ -311,6 +300,10 @@ struct cifsTconInfo { bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool need_reconnect:1; /* connection reset, tid now invalid */ +#ifdef CONFIG_CIFS_FSCACHE + u64 resource_id; /* server resource id */ + struct fscache_cookie *fscache; /* cookie for share */ +#endif /* BB add field for back pointer to sb struct(s)? */ }; @@ -363,7 +356,7 @@ struct cifsFileInfo { atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; - struct slow_work oplock_break; /* slow_work job for oplock breaks */ + struct work_struct oplock_break; /* work for oplock breaks */ }; /* Take a reference on the file private data */ @@ -398,6 +391,9 @@ struct cifsInodeInfo { bool invalid_mapping:1; /* pagecache is invalid */ u64 server_eof; /* current file size on server */ u64 uniqueid; /* server inode number */ +#ifdef CONFIG_CIFS_FSCACHE + struct fscache_cookie *fscache; +#endif struct inode vfs_inode; }; @@ -732,4 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +void cifs_oplock_break(struct work_struct *work); +void cifs_oplock_break_get(struct cifsFileInfo *cfile); +void cifs_oplock_break_put(struct cifsFileInfo *cfile); + extern const struct slow_work_ops cifs_oplock_break_ops; + +#endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fb6318b..1f54508 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -86,7 +86,9 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); -extern int cifs_convert_address(char *src, void *dst); +extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); +extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + unsigned short int port); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2208f06..95c2ea6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -48,6 +48,7 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "cn_cifs.h" +#include "fscache.h" #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -66,6 +67,7 @@ struct smb_vol { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[16]; /* netbios name of client */ char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ + uid_t cred_uid; uid_t linux_uid; gid_t linux_gid; mode_t file_mode; @@ -97,6 +99,7 @@ struct smb_vol { bool noblocksnd:1; bool noautotune:1; bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ + bool fsc:1; /* enable fscache */ unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -830,7 +833,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ vol->target_rfc1001_name[0] = 0; - vol->linux_uid = current_uid(); /* use current_euid() instead? */ + vol->cred_uid = current_uid(); + vol->linux_uid = current_uid(); vol->linux_gid = current_gid(); /* default to only allowing write access to owner of the mount */ @@ -1257,6 +1261,12 @@ cifs_parse_mount_options(char *options, const char *devname, } else if ((strnicmp(data, "nocase", 6) == 0) || (strnicmp(data, "ignorecase", 10) == 0)) { vol->nocase = 1; + } else if (strnicmp(data, "mand", 4) == 0) { + /* ignore */ + } else if (strnicmp(data, "nomand", 6) == 0) { + /* ignore */ + } else if (strnicmp(data, "_netdev", 7) == 0) { + /* ignore */ } else if (strnicmp(data, "brl", 3) == 0) { vol->nobrl = 0; } else if ((strnicmp(data, "nobrl", 5) == 0) || @@ -1331,6 +1341,8 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); + } else if (strnicmp(data, "fsc", 3) == 0) { + vol->fsc = true; } else printk(KERN_WARNING "CIFS: Unknown mount option %s\n", data); @@ -1380,18 +1392,92 @@ cifs_parse_mount_options(char *options, const char *devname, return 0; } +static bool +match_address(struct TCP_Server_Info *server, struct sockaddr *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + + switch (addr->sa_family) { + case AF_INET: + if (addr4->sin_addr.s_addr != + server->addr.sockAddr.sin_addr.s_addr) + return false; + if (addr4->sin_port && + addr4->sin_port != server->addr.sockAddr.sin_port) + return false; + break; + case AF_INET6: + if (!ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr)) + return false; + if (addr6->sin6_scope_id != + server->addr.sockAddr6.sin6_scope_id) + return false; + if (addr6->sin6_port && + addr6->sin6_port != server->addr.sockAddr6.sin6_port) + return false; + break; + } + + return true; +} + +static bool +match_security(struct TCP_Server_Info *server, struct smb_vol *vol) +{ + unsigned int secFlags; + + if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = vol->secFlg; + else + secFlags = global_secflags | vol->secFlg; + + switch (server->secType) { + case LANMAN: + if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) + return false; + break; + case NTLMv2: + if (!(secFlags & CIFSSEC_MAY_NTLMV2)) + return false; + break; + case NTLM: + if (!(secFlags & CIFSSEC_MAY_NTLM)) + return false; + break; + case Kerberos: + if (!(secFlags & CIFSSEC_MAY_KRB5)) + return false; + break; + case RawNTLMSSP: + if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) + return false; + break; + default: + /* shouldn't happen */ + return false; + } + + /* now check if signing mode is acceptible */ + if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && + (server->secMode & SECMODE_SIGN_REQUIRED)) + return false; + else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && + (server->secMode & + (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) + return false; + + return true; +} + static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) +cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) { - struct list_head *tmp; struct TCP_Server_Info *server; - struct sockaddr_in *addr4 = (struct sockaddr_in *) addr; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr; write_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { /* * the demux thread can exit on its own while still in CifsNew * so don't accept any sockets in that state. Since the @@ -1401,37 +1487,11 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) if (server->tcpStatus == CifsNew) continue; - switch (addr->ss_family) { - case AF_INET: - if (addr4->sin_addr.s_addr == - server->addr.sockAddr.sin_addr.s_addr) { - addr4->sin_port = htons(port); - /* user overrode default port? */ - if (addr4->sin_port) { - if (addr4->sin_port != - server->addr.sockAddr.sin_port) - continue; - } - break; - } else - continue; + if (!match_address(server, addr)) + continue; - case AF_INET6: - if (ipv6_addr_equal(&addr6->sin6_addr, - &server->addr.sockAddr6.sin6_addr) && - (addr6->sin6_scope_id == - server->addr.sockAddr6.sin6_scope_id)) { - addr6->sin6_port = htons(port); - /* user overrode default port? */ - if (addr6->sin6_port) { - if (addr6->sin6_port != - server->addr.sockAddr6.sin6_port) - continue; - } - break; - } else - continue; - } + if (!match_security(server, vol)) + continue; ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1460,6 +1520,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + cifs_fscache_release_client_cookie(server); + task = xchg(&server->tsk, NULL); if (task) force_sig(SIGKILL, task); @@ -1479,7 +1541,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); if (volume_info->UNCip && volume_info->UNC) { - rc = cifs_convert_address(volume_info->UNCip, &addr); + rc = cifs_fill_sockaddr((struct sockaddr *)&addr, + volume_info->UNCip, + strlen(volume_info->UNCip), + volume_info->port); if (!rc) { /* we failed translating address */ rc = -EINVAL; @@ -1499,7 +1564,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); + tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); if (tcp_ses) return tcp_ses; @@ -1543,12 +1608,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cFYI(1, "attempting ipv6 connect"); /* BB should we allow ipv6 on port 139? */ /* other OS never observed in Wild doing 139 with v6 */ - sin_server6->sin6_port = htons(volume_info->port); memcpy(&tcp_ses->addr.sockAddr6, sin_server6, sizeof(struct sockaddr_in6)); rc = ipv6_connect(tcp_ses); } else { - sin_server->sin_port = htons(volume_info->port); memcpy(&tcp_ses->addr.sockAddr, sin_server, sizeof(struct sockaddr_in)); rc = ipv4_connect(tcp_ses); @@ -1577,6 +1640,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); write_unlock(&cifs_tcp_ses_lock); + cifs_fscache_get_client_cookie(tcp_ses); + return tcp_ses; out_err: @@ -1591,17 +1656,27 @@ out_err: } static struct cifsSesInfo * -cifs_find_smb_ses(struct TCP_Server_Info *server, char *username) +cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) { - struct list_head *tmp; struct cifsSesInfo *ses; write_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list); - if (strncmp(ses->userName, username, MAX_USERNAME_SIZE)) - continue; - + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + switch (server->secType) { + case Kerberos: + if (vol->cred_uid != ses->cred_uid) + continue; + break; + default: + /* anything else takes username/password */ + if (strncmp(ses->userName, vol->username, + MAX_USERNAME_SIZE)) + continue; + if (strlen(vol->username) != 0 && + strncmp(ses->password, vol->password, + MAX_PASSWORD_SIZE)) + continue; + } ++ses->ses_count; write_unlock(&cifs_tcp_ses_lock); return ses; @@ -1643,7 +1718,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) xid = GetXid(); - ses = cifs_find_smb_ses(server, volume_info->username); + ses = cifs_find_smb_ses(server, volume_info); if (ses) { cFYI(1, "Existing smb sess found (status=%d)", ses->status); @@ -1706,6 +1781,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses->domainName) strcpy(ses->domainName, volume_info->domainname); } + ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; ses->overrideSecFlg = volume_info->secFlg; @@ -1773,6 +1849,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon) CIFSSMBTDis(xid, tcon); _FreeXid(xid); + cifs_fscache_release_super_cookie(tcon); tconInfoFree(tcon); cifs_put_smb_ses(ses); } @@ -1843,6 +1920,8 @@ cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info) list_add(&tcon->tcon_list, &ses->tcon_list); write_unlock(&cifs_tcp_ses_lock); + cifs_fscache_get_super_cookie(tcon); + return tcon; out_fail: @@ -2397,6 +2476,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; if (pvolume_info->dynperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; + if (pvolume_info->fsc) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; if (pvolume_info->direct_io) { cFYI(1, "mounting share using direct i/o"); cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e7ae78b..578d88c 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -130,12 +130,6 @@ cifs_bp_rename_retry: return full_path; } -/* - * When called with struct file pointer set to NULL, there is no way we could - * update file->private_data, but getting it stuck on openFileList provides a - * way to access it from cifs_fill_filedata and thereby set file->private_data - * from cifs_open. - */ struct cifsFileInfo * cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file, struct vfsmount *mnt, unsigned int oflags) @@ -163,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); atomic_set(&pCifsFile->count, 1); - slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops); + INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); write_lock(&GlobalSMBSeslock); list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 4db2c5e..0eb8702 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -4,6 +4,8 @@ * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) * * Contains the CIFS DFS upcall routines used for hostname to * IP address translation. @@ -24,145 +26,73 @@ */ #include <linux/slab.h> -#include <keys/user-type.h> +#include <linux/dns_resolver.h> #include "dns_resolve.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" -/* Checks if supplied name is IP address - * returns: - * 1 - name is IP - * 0 - name is not IP - */ -static int -is_ip(char *name) -{ - struct sockaddr_storage ss; - - return cifs_convert_address(name, &ss); -} - -static int -dns_resolver_instantiate(struct key *key, const void *data, - size_t datalen) -{ - int rc = 0; - char *ip; - - ip = kmalloc(datalen + 1, GFP_KERNEL); - if (!ip) - return -ENOMEM; - - memcpy(ip, data, datalen); - ip[datalen] = '\0'; - - /* make sure this looks like an address */ - if (!is_ip(ip)) { - kfree(ip); - return -EINVAL; - } - - key->type_data.x[0] = datalen; - key->payload.data = ip; - - return rc; -} - -static void -dns_resolver_destroy(struct key *key) -{ - kfree(key->payload.data); -} - -struct key_type key_type_dns_resolver = { - .name = "dns_resolver", - .def_datalen = sizeof(struct in_addr), - .describe = user_describe, - .instantiate = dns_resolver_instantiate, - .destroy = dns_resolver_destroy, - .match = user_match, -}; - -/* Resolves server name to ip address. - * input: - * unc - server UNC - * output: - * *ip_addr - pointer to server ip, caller responcible for freeing it. - * return 0 on success +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. */ int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { - int rc = -EAGAIN; - struct key *rkey = ERR_PTR(-EAGAIN); + struct sockaddr_storage ss; + const char *hostname, *sep; char *name; - char *data = NULL; - int len; + int len, rc; if (!ip_addr || !unc) return -EINVAL; - /* search for server name delimiter */ len = strlen(unc); if (len < 3) { cFYI(1, "%s: unc is too short: %s", __func__, unc); return -EINVAL; } + + /* Discount leading slashes for cifs */ len -= 2; - name = memchr(unc+2, '\\', len); - if (!name) { + hostname = unc + 2; + + /* Search for server name delimiter */ + sep = memchr(hostname, '\\', len); + if (sep) + len = sep - unc; + else cFYI(1, "%s: probably server name is whole unc: %s", - __func__, unc); - } else { - len = (name - unc) - 2/* leading // */; - } + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cERROR(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); + else + cFYI(1, "%s: resolved: %*.*s to %s", + __func__, len, len, hostname, *ip_addr); + return rc; - name = kmalloc(len+1, GFP_KERNEL); - if (!name) { - rc = -ENOMEM; - return rc; - } - memcpy(name, unc+2, len); +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + memcpy(name, hostname, len); name[len] = 0; - - if (is_ip(name)) { - cFYI(1, "%s: it is IP, skipping dns upcall: %s", - __func__, name); - data = name; - goto skip_upcall; - } - - rkey = request_key(&key_type_dns_resolver, name, ""); - if (!IS_ERR(rkey)) { - len = rkey->type_data.x[0]; - data = rkey->payload.data; - } else { - cERROR(1, "%s: unable to resolve: %s", __func__, name); - goto out; - } - -skip_upcall: - if (data) { - *ip_addr = kmalloc(len + 1, GFP_KERNEL); - if (*ip_addr) { - memcpy(*ip_addr, data, len + 1); - if (!IS_ERR(rkey)) - cFYI(1, "%s: resolved: %s to %s", __func__, - name, - *ip_addr - ); - rc = 0; - } else { - rc = -ENOMEM; - } - if (!IS_ERR(rkey)) - key_put(rkey); - } - -out: - kfree(name); - return rc; + cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + *ip_addr = name; + return 0; } - - diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 966e928..d3f5d27 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -24,8 +24,6 @@ #define _DNS_RESOLVE_H #ifdef __KERNEL__ -#include <linux/key-type.h> -extern struct key_type key_type_dns_resolver; extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 409e4f5..db11fde 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -40,6 +40,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "fscache.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -282,6 +283,9 @@ int cifs_open(struct inode *inode, struct file *file) CIFSSMBClose(xid, tcon, netfid); rc = -ENOMEM; } + + cifs_fscache_set_inode_cookie(inode, file); + goto out; } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) @@ -373,6 +377,8 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } + cifs_fscache_set_inode_cookie(inode, file); + if (oplock & CIFS_CREATE_ACTION) { /* time to set mode which we can not set earlier due to problems creating new read-only files */ @@ -427,7 +433,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush) __u16 netfid; if (file->private_data) - pCifsFile = (struct cifsFileInfo *)file->private_data; + pCifsFile = file->private_data; else return -EBADF; @@ -565,8 +571,7 @@ int cifs_close(struct inode *inode, struct file *file) int xid, timeout; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct cifsFileInfo *pSMBFile = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *pSMBFile = file->private_data; xid = GetXid(); @@ -641,8 +646,7 @@ int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; int xid; - struct cifsFileInfo *pCFileStruct = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *pCFileStruct = file->private_data; char *ptmp; cFYI(1, "Closedir inode = 0x%p", inode); @@ -863,8 +867,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) length, pfLock, posix_lock_type, wait_flag); } else { - struct cifsFileInfo *fid = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *fid = file->private_data; if (numLock) { rc = CIFSSMBLock(xid, tcon, netfid, length, @@ -965,7 +968,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, if (file->private_data == NULL) return -EBADF; - open_file = (struct cifsFileInfo *) file->private_data; + open_file = file->private_data; rc = generic_write_checks(file, poffset, &write_size, 0); if (rc) @@ -1067,7 +1070,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (file->private_data == NULL) return -EBADF; - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; xid = GetXid(); @@ -1651,8 +1654,7 @@ int cifs_fsync(struct file *file, int datasync) int xid; int rc = 0; struct cifsTconInfo *tcon; - struct cifsFileInfo *smbfile = - (struct cifsFileInfo *)file->private_data; + struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file->f_path.dentry->d_inode; xid = GetXid(); @@ -1756,7 +1758,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); @@ -1837,7 +1839,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; if ((file->f_flags & O_ACCMODE) == O_WRONLY) cFYI(1, "attempting read on write only file instance"); @@ -1942,6 +1944,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); data += PAGE_CACHE_SIZE; + + /* add page to FS-Cache */ + cifs_readpage_to_fscache(mapping->host, page); } return; } @@ -1968,10 +1973,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, FreeXid(xid); return rc; } - open_file = (struct cifsFileInfo *)file->private_data; + open_file = file->private_data; cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); pTcon = cifs_sb->tcon; + /* + * Reads as many pages as possible from fscache. Returns -ENOBUFS + * immediately if the cookie is negative + */ + rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, + &num_pages); + if (rc == 0) + goto read_complete; + cFYI(DBG2, "rpages: num pages %d", num_pages); for (i = 0; i < num_pages; ) { unsigned contig_pages; @@ -2082,6 +2096,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, smb_read_data = NULL; } +read_complete: FreeXid(xid); return rc; } @@ -2092,6 +2107,11 @@ static int cifs_readpage_worker(struct file *file, struct page *page, char *read_data; int rc; + /* Is the page cached? */ + rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page); + if (rc == 0) + goto read_complete; + page_cache_get(page); read_data = kmap(page); /* for reads over a certain size could initiate async read ahead */ @@ -2111,11 +2131,17 @@ static int cifs_readpage_worker(struct file *file, struct page *page, flush_dcache_page(page); SetPageUptodate(page); + + /* send this page to the cache */ + cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page); + rc = 0; io_error: kunmap(page); page_cache_release(page); + +read_complete: return rc; } @@ -2265,8 +2291,23 @@ out: return rc; } -static void -cifs_oplock_break(struct slow_work *work) +static int cifs_release_page(struct page *page, gfp_t gfp) +{ + if (PagePrivate(page)) + return 0; + + return cifs_fscache_release_page(page, gfp); +} + +static void cifs_invalidate_page(struct page *page, unsigned long offset) +{ + struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); + + if (offset == 0) + cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); +} + +void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); @@ -2303,33 +2344,30 @@ cifs_oplock_break(struct slow_work *work) LOCKING_ANDX_OPLOCK_RELEASE, false); cFYI(1, "Oplock release rc = %d", rc); } + + /* + * We might have kicked in before is_valid_oplock_break() + * finished grabbing reference for us. Make sure it's done by + * waiting for GlobalSMSSeslock. + */ + write_lock(&GlobalSMBSeslock); + write_unlock(&GlobalSMBSeslock); + + cifs_oplock_break_put(cfile); } -static int -cifs_oplock_break_get(struct slow_work *work) +void cifs_oplock_break_get(struct cifsFileInfo *cfile) { - struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, - oplock_break); mntget(cfile->mnt); cifsFileInfo_get(cfile); - return 0; } -static void -cifs_oplock_break_put(struct slow_work *work) +void cifs_oplock_break_put(struct cifsFileInfo *cfile) { - struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, - oplock_break); mntput(cfile->mnt); cifsFileInfo_put(cfile); } -const struct slow_work_ops cifs_oplock_break_ops = { - .get_ref = cifs_oplock_break_get, - .put_ref = cifs_oplock_break_put, - .execute = cifs_oplock_break, -}; - const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, @@ -2338,6 +2376,8 @@ const struct address_space_operations cifs_addr_ops = { .write_begin = cifs_write_begin, .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; @@ -2354,6 +2394,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .write_begin = cifs_write_begin, .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, + .releasepage = cifs_release_page, + .invalidatepage = cifs_invalidate_page, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c new file mode 100644 index 0000000..9f3f5c4 --- /dev/null +++ b/fs/cifs/fscache.c @@ -0,0 +1,236 @@ +/* + * fs/cifs/fscache.c - CIFS filesystem cache interface + * + * Copyright (c) 2010 Novell, Inc. + * Author(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "fscache.h" +#include "cifsglob.h" +#include "cifs_debug.h" +#include "cifs_fs_sb.h" + +void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) +{ + server->fscache = + fscache_acquire_cookie(cifs_fscache_netfs.primary_index, + &cifs_fscache_server_index_def, server); + cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, + server->fscache); +} + +void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +{ + cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, + server->fscache); + fscache_relinquish_cookie(server->fscache, 0); + server->fscache = NULL; +} + +void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) +{ + struct TCP_Server_Info *server = tcon->ses->server; + + tcon->fscache = + fscache_acquire_cookie(server->fscache, + &cifs_fscache_super_index_def, tcon); + cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)", + server->fscache, tcon->fscache); +} + +void cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) +{ + cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache); + fscache_relinquish_cookie(tcon->fscache, 0); + tcon->fscache = NULL; +} + +static void cifs_fscache_enable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + if (cifsi->fscache) + return; + + cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", + cifs_sb->tcon->fscache, cifsi->fscache); +} + +void cifs_fscache_release_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "CIFS releasing inode cookie (0x%p)", + cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 0); + cifsi->fscache = NULL; + } +} + +static void cifs_fscache_disable_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + if (cifsi->fscache) { + cFYI(1, "CIFS disabling inode cookie (0x%p)", + cifsi->fscache); + fscache_relinquish_cookie(cifsi->fscache, 1); + cifsi->fscache = NULL; + } +} + +void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) +{ + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) + cifs_fscache_disable_inode_cookie(inode); + else { + cifs_fscache_enable_inode_cookie(inode); + cFYI(1, "CIFS: fscache inode cookie set"); + } +} + +void cifs_fscache_reset_inode_cookie(struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct fscache_cookie *old = cifsi->fscache; + + if (cifsi->fscache) { + /* retire the current fscache cache and get a new one */ + fscache_relinquish_cookie(cifsi->fscache, 1); + + cifsi->fscache = fscache_acquire_cookie(cifs_sb->tcon->fscache, + &cifs_fscache_inode_object_def, + cifsi); + cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", + cifsi->fscache, old); + } +} + +int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + if (PageFsCache(page)) { + struct inode *inode = page->mapping->host; + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + cFYI(1, "CIFS: fscache release page (0x%p/0x%p)", + page, cifsi->fscache); + if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) + return 0; + } + + return 1; +} + +static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, + int error) +{ + cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)", + page, error); + if (!error) + SetPageUptodate(page); + unlock_page(page); +} + +/* + * Retrieve a page from FS-Cache + */ +int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p", + CIFS_I(inode)->fscache, page, inode); + ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, + cifs_readpage_from_fscache_complete, + NULL, + GFP_KERNEL); + switch (ret) { + + case 0: /* page found in fscache, read submitted */ + cFYI(1, "CIFS: readpage_from_fscache: submitted"); + return ret; + case -ENOBUFS: /* page won't be cached */ + case -ENODATA: /* page not in cache */ + cFYI(1, "CIFS: readpage_from_fscache %d", ret); + return 1; + + default: + cERROR(1, "unknown error ret = %d", ret); + } + return ret; +} + +/* + * Retrieve a set of pages from FS-Cache + */ +int __cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + int ret; + + cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)", + CIFS_I(inode)->fscache, *nr_pages, inode); + ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, + pages, nr_pages, + cifs_readpage_from_fscache_complete, + NULL, + mapping_gfp_mask(mapping)); + switch (ret) { + case 0: /* read submitted to the cache for all pages */ + cFYI(1, "CIFS: readpages_from_fscache: submitted"); + return ret; + + case -ENOBUFS: /* some pages are not cached and can't be */ + case -ENODATA: /* some pages are not cached */ + cFYI(1, "CIFS: readpages_from_fscache: no page"); + return 1; + + default: + cFYI(1, "unknown error ret = %d", ret); + } + + return ret; +} + +void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +{ + int ret; + + cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p", + CIFS_I(inode)->fscache, page, inode); + ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); + if (ret != 0) + fscache_uncache_page(CIFS_I(inode)->fscache, page); +} + +void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie); + fscache_wait_on_page_write(cookie, page); + fscache_uncache_page(cookie, page); +} + diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h new file mode 100644 index 0000000..31b88ec2 --- /dev/null +++ b/fs/cifs/fscache.h @@ -0,0 +1,136 @@ +/* + * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * + * Copyright (c) 2010 Novell, Inc. + * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _CIFS_FSCACHE_H +#define _CIFS_FSCACHE_H + +#include <linux/fscache.h> + +#include "cifsglob.h" + +#ifdef CONFIG_CIFS_FSCACHE + +extern struct fscache_netfs cifs_fscache_netfs; +extern const struct fscache_cookie_def cifs_fscache_server_index_def; +extern const struct fscache_cookie_def cifs_fscache_super_index_def; +extern const struct fscache_cookie_def cifs_fscache_inode_object_def; + +extern int cifs_fscache_register(void); +extern void cifs_fscache_unregister(void); + +/* + * fscache.c + */ +extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); +extern void cifs_fscache_get_super_cookie(struct cifsTconInfo *); +extern void cifs_fscache_release_super_cookie(struct cifsTconInfo *); + +extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); +extern void cifs_fscache_reset_inode_cookie(struct inode *); + +extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); +extern int __cifs_readpage_from_fscache(struct inode *, struct page *); +extern int __cifs_readpages_from_fscache(struct inode *, + struct address_space *, + struct list_head *, + unsigned *); + +extern void __cifs_readpage_to_fscache(struct inode *, struct page *); + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) +{ + if (PageFsCache(page)) + __cifs_fscache_invalidate_page(page, inode); +} + +static inline int cifs_readpage_from_fscache(struct inode *inode, + struct page *page) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpage_from_fscache(inode, page); + + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_readpages_from_fscache(inode, mapping, pages, + nr_pages); + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_readpage_to_fscache(inode, page); +} + +#else /* CONFIG_CIFS_FSCACHE */ +static inline int cifs_fscache_register(void) { return 0; } +static inline void cifs_fscache_unregister(void) {} + +static inline void +cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} +static inline void +cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} +static inline void cifs_fscache_get_super_cookie(struct cifsTconInfo *tcon) {} +static inline void +cifs_fscache_release_super_cookie(struct cifsTconInfo *tcon) {} + +static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_set_inode_cookie(struct inode *inode, + struct file *filp) {} +static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} +static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) +{ + return 1; /* May release page */ +} + +static inline void cifs_fscache_invalidate_page(struct page *page, + struct inode *inode) {} +static inline int +cifs_readpage_from_fscache(struct inode *inode, struct page *page) +{ + return -ENOBUFS; +} + +static inline int cifs_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void cifs_readpage_to_fscache(struct inode *inode, + struct page *page) {} + +#endif /* CONFIG_CIFS_FSCACHE */ + +#endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6f0683c..dc4c47a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -29,6 +29,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "fscache.h" static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral) @@ -288,7 +289,7 @@ int cifs_get_file_info_unix(struct file *filp) struct inode *inode = filp->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + struct cifsFileInfo *cfile = filp->private_data; xid = GetXid(); rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data); @@ -515,7 +516,7 @@ int cifs_get_file_info(struct file *filp) struct inode *inode = filp->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsTconInfo *tcon = cifs_sb->tcon; - struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; + struct cifsFileInfo *cfile = filp->private_data; xid = GetXid(); rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data); @@ -723,18 +724,17 @@ cifs_find_inode(struct inode *inode, void *opaque) { struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; - /* - * uh oh -- it's a directory. We can't use it since hardlinked dirs are - * verboten. Disable serverino and return it as if it were found, the - * caller can discard it, generate a uniqueid and retry the find - */ - if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + return 0; + + /* if it's not a directory or has no dentries, then flag it */ + if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; - cifs_autodisable_serverino(CIFS_SB(inode->i_sb)); - } return 1; } @@ -748,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque) return 0; } +/* + * walk dentry list for an inode and report whether it has aliases that + * are hashed. We use this to determine if a directory inode can actually + * be used. + */ +static bool +inode_has_hashed_dentries(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&dcache_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + if (!d_unhashed(dentry) || IS_ROOT(dentry)) { + spin_unlock(&dcache_lock); + return true; + } + } + spin_unlock(&dcache_lock); + return false; +} + /* Given fattrs, get a corresponding inode */ struct inode * cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) @@ -763,12 +784,16 @@ retry_iget5_locked: inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); if (inode) { - /* was there a problematic inode number collision? */ + /* was there a potentially problematic inode collision? */ if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { - iput(inode); - fattr->cf_uniqueid = iunique(sb, ROOT_I); fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; - goto retry_iget5_locked; + + if (inode_has_hashed_dentries(inode)) { + cifs_autodisable_serverino(CIFS_SB(sb)); + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + goto retry_iget5_locked; + } } cifs_fattr_to_inode(inode, fattr); @@ -776,6 +801,10 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; +#ifdef CONFIG_CIFS_FSCACHE + /* initialize per-inode cache cookie pointer */ + CIFS_I(inode)->fscache = NULL; +#endif unlock_new_inode(inode); } } @@ -807,6 +836,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); +#ifdef CONFIG_CIFS_FSCACHE + /* populate tcon->resource_id */ + cifs_sb->tcon->resource_id = CIFS_I(inode)->uniqueid; +#endif + if (rc && cifs_sb->tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); inode->i_mode |= S_IFDIR; @@ -1568,6 +1602,7 @@ cifs_invalidate_mapping(struct inode *inode) cifs_i->write_behind_rc = rc; } invalidate_remote_inode(inode); + cifs_fscache_reset_inode_cookie(inode); } int cifs_revalidate_file(struct file *filp) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 505926f..9d38a71 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -41,8 +41,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) __u64 ExtAttrMask = 0; __u64 caps; struct cifsTconInfo *tcon; - struct cifsFileInfo *pSMBFile = - (struct cifsFileInfo *)filep->private_data; + struct cifsFileInfo *pSMBFile = filep->private_data; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1394aa3..3ccadc1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) struct cifsTconInfo *tcon; struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; - int rc; cFYI(1, "Checking for oplock break or dnotify response"); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && @@ -583,13 +582,18 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pCifsInode->clientCanCacheAll = false; if (pSMB->OplockLevel == 0) pCifsInode->clientCanCacheRead = false; - rc = slow_work_enqueue(&netfile->oplock_break); - if (rc) { - cERROR(1, "failed to enqueue oplock " - "break: %d\n", rc); - } else { - netfile->oplock_break_cancelled = false; - } + + /* + * cifs_oplock_break_put() can't be called + * from here. Get reference after queueing + * succeeded. cifs_oplock_break() will + * synchronize using GlobalSMSSeslock. + */ + if (queue_work(system_nrt_wq, + &netfile->oplock_break)) + cifs_oplock_break_get(netfile); + netfile->oplock_break_cancelled = false; + read_unlock(&GlobalSMBSeslock); read_unlock(&cifs_tcp_ses_lock); return true; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d35d528..f978511 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -61,6 +61,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRremcd, -EACCES}, {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, + {ERRwriteprot, -EROFS}, {ERRbadshare, -ETXTBSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, @@ -139,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = { * Returns 0 on failure. */ static int -cifs_inet_pton(const int address_family, const char *cp, void *dst) +cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) { int ret = 0; /* calculate length by finding first slash or NULL */ if (address_family == AF_INET) - ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); + ret = in4_pton(cp, len, dst, '\\', NULL); else if (address_family == AF_INET6) - ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); + ret = in6_pton(cp, len, dst , '\\', NULL); - cFYI(DBG2, "address conversion returned %d for %s", ret, cp); + cFYI(DBG2, "address conversion returned %d for %*.*s", + ret, len, len, cp); if (ret > 0) ret = 1; return ret; @@ -164,43 +166,66 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst) * Returns 0 on failure. */ int -cifs_convert_address(char *src, void *dst) +cifs_convert_address(struct sockaddr *dst, const char *src, int len) { - int rc; - char *pct, *endp; + int rc, alen, slen; + const char *pct; + char *endp, scope_id[13]; struct sockaddr_in *s4 = (struct sockaddr_in *) dst; struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; /* IPv4 address */ - if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) { + if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { s4->sin_family = AF_INET; return 1; } - /* temporarily terminate string */ - pct = strchr(src, '%'); - if (pct) - *pct = '\0'; - - rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr); - - /* repair temp termination (if any) and make pct point to scopeid */ - if (pct) - *pct++ = '%'; + /* attempt to exclude the scope ID from the address part */ + pct = memchr(src, '%', len); + alen = pct ? pct - src : len; + rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); if (!rc) return rc; s6->sin6_family = AF_INET6; if (pct) { + /* grab the scope ID */ + slen = len - (alen + 1); + if (slen <= 0 || slen > 12) + return 0; + memcpy(scope_id, pct + 1, slen); + scope_id[slen] = '\0'; + s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); - if (!*pct || *endp) + if (endp != scope_id + slen) return 0; } return rc; } +int +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src, len)) + return 0; + + switch (dst->sa_family) { + case AF_INET: + ((struct sockaddr_in *)dst)->sin_port = htons(port); + break; + case AF_INET6: + ((struct sockaddr_in6 *)dst)->sin6_port = htons(port); + break; + default: + return 0; + } + + return 1; +} + /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index daf1753..d5e591f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + break; + } + for (i = 0; (i < num_to_fill) && (rc == 0); i++) { if (current_entry == NULL) { /* evaluate whether this case is an error */ diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index c5084d2..7f16cb8 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -76,6 +76,7 @@ #define ERRnofiles 18 /* A File Search command can find no more files matching the specified criteria. */ +#define ERRwriteprot 19 /* media is write protected */ #define ERRgeneral 31 #define ERRbadshare 32 /* The sharing mode specified for an Open conflicts with existing FIDs on diff --git a/fs/compat.c b/fs/compat.c index 6490d21..c6fda9a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -8,7 +8,7 @@ * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 618f381..5d9b936 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -4,7 +4,7 @@ * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@suse.cz) + * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) * * These routines maintain argument size conversion between 32bit and 64bit * ioctls. @@ -578,8 +578,11 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, } /* Bluetooth ioctls */ -#define HCIUARTSETPROTO _IOW('U', 200, int) -#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTSETPROTO _IOW('U', 200, int) +#define HCIUARTGETPROTO _IOR('U', 201, int) +#define HCIUARTGETDEVICE _IOR('U', 202, int) +#define HCIUARTSETFLAGS _IOW('U', 203, int) +#define HCIUARTGETFLAGS _IOR('U', 204, int) #define BNEPCONNADD _IOW('B', 200, int) #define BNEPCONNDEL _IOW('B', 201, int) @@ -1298,6 +1301,8 @@ COMPATIBLE_IOCTL(HCISETLINKPOL) COMPATIBLE_IOCTL(HCISETLINKMODE) COMPATIBLE_IOCTL(HCISETACLMTU) COMPATIBLE_IOCTL(HCISETSCOMTU) +COMPATIBLE_IOCTL(HCIBLOCKADDR) +COMPATIBLE_IOCTL(HCIUNBLOCKADDR) COMPATIBLE_IOCTL(HCIINQUIRY) COMPATIBLE_IOCTL(HCIUARTSETPROTO) COMPATIBLE_IOCTL(HCIUARTGETPROTO) diff --git a/fs/dcache.c b/fs/dcache.c index c8c78ba..86d4db1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/direct-io.c b/fs/direct-io.c index 7600aac..a10cb91 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c0d35c6..37a34c2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id) for (i = 0 ; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry(con, h, &connection_hash[i], list) { - if (con && con->sctp_assoc == assoc_id) { + if (con->sctp_assoc == assoc_id) { mutex_unlock(&connections_lock); return con; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 2c6ad51..ef17e01 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = { int __init dlm_netlink_init(void) { - int rv; - - rv = genl_register_family(&family); - if (rv) - return rv; - - rv = genl_register_ops(&family, &dlm_nl_ops); - if (rv < 0) - goto err; - return 0; - err: - genl_unregister_family(&family); - return rv; + return genl_register_family_with_ops(&family, &dlm_nl_ops, 1); } void dlm_netlink_exit(void) { - genl_unregister_ops(&family, &dlm_nl_ops); genl_unregister_family(&family); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 1cc0876..a2e3b56 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -762,7 +762,7 @@ ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, /** * ecryptfs_init_crypt_ctx - * @crypt_stat: Uninitilized crypt stats structure + * @crypt_stat: Uninitialized crypt stats structure * * Initialize the crypto context. * diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 2d8dbce..46c4dd8 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -31,9 +31,9 @@ static struct mutex ecryptfs_msg_ctx_lists_mux; static struct hlist_head *ecryptfs_daemon_hash; struct mutex ecryptfs_daemon_hash_mux; -static int ecryptfs_hash_buckets; +static int ecryptfs_hash_bits; #define ecryptfs_uid_hash(uid) \ - hash_long((unsigned long)uid, ecryptfs_hash_buckets) + hash_long((unsigned long)uid, ecryptfs_hash_bits) static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; @@ -486,18 +486,19 @@ int ecryptfs_init_messaging(void) } mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); - ecryptfs_hash_buckets = 1; - while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) - ecryptfs_hash_buckets++; + ecryptfs_hash_bits = 1; + while (ecryptfs_number_of_users >> ecryptfs_hash_bits) + ecryptfs_hash_bits++; ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) - * ecryptfs_hash_buckets), GFP_KERNEL); + * (1 << ecryptfs_hash_bits)), + GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } - for (i = 0; i < ecryptfs_hash_buckets; i++) + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) @@ -554,7 +555,7 @@ void ecryptfs_release_messaging(void) int i; mutex_lock(&ecryptfs_daemon_hash_mux); - for (i = 0; i < ecryptfs_hash_buckets; i++) { + for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; hlist_for_each_entry(daemon, elem, @@ -28,7 +28,6 @@ #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> -#include <linux/smp_lock.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> @@ -653,6 +652,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else stack_base = vma->vm_start - stack_expand; #endif + current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; @@ -1891,13 +1891,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ clear_thread_flag(TIF_SIGPENDING); - /* - * lock_kernel() because format_corename() is controlled by sysctl, which - * uses lock_kernel() - */ - lock_kernel(); ispipe = format_corename(corename, signr); - unlock_kernel(); if (ispipe) { int dump_count; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index 522b154..e8c6ba0 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -31,6 +31,7 @@ config EXT3_FS config EXT3_DEFAULTS_TO_ORDERED bool "Default to 'data=ordered' in ext3" depends on EXT3_FS + default y help The journal mode options for ext3 have different tradeoffs between when data is guaranteed to be on disk and diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 735f019..001eb0e 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1149,9 +1149,25 @@ static int walk_page_buffers( handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext3_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; } /* @@ -1625,10 +1641,7 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_writepage(page, ext3_get_block, wbc); - else - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, ext3_get_block, wbc); err = ext3_journal_stop(handle); if (!ret) @@ -1922,17 +1935,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -2284,27 +2286,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -2327,6 +2308,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, truncate_restart_transaction(handle, inode); } + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + ext3_free_blocks(handle, inode, nr, 1); if (parent_bh) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index ee18408..2b35ddb 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, ext3_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { + for (block = 0; block < blocks; block++) { bh = ext3_bread(handle, dir, block, 0, &retval); if(!bh) return retval; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 54351ac..0ccd7b1 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, ext3_fsblk_t n_blocks_count) { ext3_fsblk_t o_blocks_count; - unsigned long o_groups_count; ext3_grpblk_t last; ext3_grpblk_t add; struct buffer_head * bh; @@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, * yet: we're going to revalidate es->s_blocks_count after * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); - o_groups_count = EXT3_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6c953bb..9650a95 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) */ seq_puts(seq, ",barrier="); seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1255,10 +1252,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); break; default: ext3_msg(sb, KERN_ERR, @@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) break; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - ext3_msg(sb, KERN_WARNING, - "warning: ignoring nobh option - " - "it is supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - } /* * The journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index feaf498..5e2ed45 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; else { inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); if (error == 0) acl = NULL; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 95b7594..bd30799 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_grpblk_t bit; unsigned int i; struct ext4_group_desc *desc; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; struct ext4_group_info *grp; - sbi = EXT4_SB(sb); - es = sbi->s_es; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; - sb->s_dirt = 1; error_return: brelse(bitmap_bh); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5b6973f..3db5084 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; + } while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; - else + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; + } } return 1; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ea5e6cb..374510f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) } -int ext4_check_dir_entry(const char *function, struct inode *dir, - struct ext4_dir_entry_2 *de, - struct buffer_head *bh, - unsigned int offset) +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, @@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error_inode(function, dir, - "bad entry in directory: %s - block=%llu" + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - " "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned long long) bh->b_blocknr, - (unsigned) (offset%bh->b_size), offset, + error_msg, (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; @@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + EXT4_INODE_INDEX); } stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); @@ -193,7 +194,7 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry("ext4_readdir", inode, de, + if (!ext4_check_dir_entry(inode, de, bh, offset)) { /* * On error, skip the f_pos to the next block @@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct dir_private_info *info; int len; - info = (struct dir_private_info *) dir_file->private_data; + info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 19a4de5..e03841d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,10 +57,13 @@ #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode(__func__, (inode), (fmt), ## a) + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) #define EXT4_ERROR_FILE(file, fmt, a...) \ - ext4_error_file(__func__, (file), (fmt), ## a) + ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -167,13 +170,15 @@ struct mpage_da_data { }; #define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { - struct list_head list; /* per-file finished AIO list */ + struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ } ext4_io_end_t; /* @@ -460,7 +465,7 @@ struct ext4_new_group_data { }; /* - * Flags used by ext4_get_blocks() + * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unitialized extent to be an initialized ext4 */ @@ -873,7 +878,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ @@ -982,7 +986,7 @@ struct ext4_super_block { __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ - __u8 s_reserved_char_pad; + __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ @@ -1000,12 +1004,34 @@ struct ext4_super_block { __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad2; + __u8 s_reserved_char_pad; __le16 s_reserved_pad; __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ - __u32 s_reserved[160]; /* Padding to the end of the block */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_reserved[112]; /* Padding to the end of the block */ }; +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + #ifdef __KERNEL__ /* @@ -1143,6 +1169,9 @@ struct ext4_sb_info { /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1313,6 +1342,10 @@ EXT4_INODE_BIT_FNS(state, state_flags) #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times @@ -1379,6 +1412,43 @@ struct ext4_dir_entry_2 { #define EXT4_MAX_REC_LEN ((1<<16)-1) /* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ @@ -1510,9 +1580,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ -extern int ext4_check_dir_entry(const char *, struct inode *, - struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, de, bh, offset) \ + __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1601,8 +1673,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); extern int ext4_ext_migrate(struct inode *); /* namei.c */ -extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); -extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, @@ -1616,25 +1686,38 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void __ext4_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) -extern void ext4_error_inode(const char *, struct inode *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext4_error_file(const char *, struct file *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void __ext4_std_error(struct super_block *, const char *, int); -extern void ext4_abort(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void __ext4_warning(struct super_block *, const char *, +extern void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern void ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); +extern void ext4_error_file(struct file *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) + __attribute__ ((format (printf, 4, 5))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, - const char *, const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern void __ext4_grp_locked_error(const char *, unsigned int, \ + struct super_block *, ext4_group_t, \ + unsigned long, ext4_fsblk_t, \ + const char *, ...) + __attribute__ ((format (printf, 7, 8))); +#define ext4_grp_locked_error(sb, grp, message...) \ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -1768,7 +1851,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ - __ext4_std_error((sb), __func__, (errno)); \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP @@ -1860,6 +1943,12 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } +static inline void ext4_mark_super_dirty(struct super_block *sb) +{ + if (EXT4_SB(sb)->s_journal == NULL) + sb->s_dirt =1; +} + /* * Inodes and files operations */ @@ -1905,9 +1994,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_get_blocks(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); /* move_extent.c */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 53d2764..6e272ef 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,29 +6,29 @@ #include <trace/events/ext4.h> -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh) +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_get_undo_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, + ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } return err; } -int __ext4_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh) +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, + ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } return err; @@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, * If the handle isn't valid we're not journaling, but we still need to * call into ext4_journal_revoke() to put the buffer head. */ -int __ext4_forget(const char *where, handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - ext4_fsblk_t blocknr) +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; @@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); return err; } return 0; @@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { - ext4_journal_abort_handle(where, __func__, bh, handle, err); - ext4_abort(inode->i_sb, __func__, + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { int err = 0; @@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where, if (ext4_handle_valid(handle)) { err = jbd2_journal_get_create_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); } return err; } -int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, - struct inode *inode, struct buffer_head *bh) +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); } else { if (inode) mark_buffer_dirty_inode(bh, inode); @@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long) bh->b_blocknr); + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); err = -EIO; } } } return err; } + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + sb->s_dirt = 1; + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index dade0c0..b0bd792 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* * Wrapper functions with which ext4 calls into JBD. */ -void ext4_journal_abort_handle(const char *caller, const char *err_fn, +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh); +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); -int __ext4_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh); +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); -int __ext4_forget(const char *where, handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - ext4_fsblk_t blocknr); +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); -int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, - struct inode *inode, struct buffer_head *bh); +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); #define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__func__, (handle), (bh)) + __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, (handle), (bh)) + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ - __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ - (block_nr)) + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, (handle), (bh)) + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ - __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); -int __ext4_journal_stop(const char *where, handle_t *handle); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) } #define ext4_journal_stop(handle) \ - __ext4_journal_stop(__func__, (handle)) + __ext4_journal_stop(__func__, __LINE__, (handle)) static inline handle_t *ext4_journal_current_handle(void) { @@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode) * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_mutex for direct I/O reads. This only works for extent-based - * files, and it doesn't work for nobh or if data journaling is - * enabled, since the dioread_nolock code uses b_private to pass - * information back to the I/O completion handler, and this conflicts - * with the jbd's use of b_private. + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; - if (test_opt(inode->i_sb, NOBH)) - return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 377309c..06328d3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -401,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode, return 1; } -static int __ext4_ext_check(const char *function, struct inode *inode, - struct ext4_extent_header *eh, - int depth) +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth) { const char *error_msg; int max = 0; @@ -436,7 +436,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - ext4_error_inode(function, inode, + ext4_error_inode(inode, function, line, 0, "bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", error_msg, le16_to_cpu(eh->eh_magic), @@ -447,7 +447,7 @@ corrupted: } #define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, inode, eh, depth) + __ext4_ext_check(__func__, __LINE__, inode, eh, depth) int ext4_ext_check_inode(struct inode *inode) { @@ -1083,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; - struct ext4_extent_idx *fidx; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; @@ -1144,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(curp->p_idx, newblock); neh = ext_inode_hdr(inode); - fidx = EXT_FIRST_INDEX(neh); ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), - le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + idx_pblock(EXT_FIRST_INDEX(neh))); neh->eh_depth = cpu_to_le16(path->p_depth + 1); err = ext4_ext_dirty(handle, inode, curp); @@ -2937,7 +2936,7 @@ fix_extent_len: * One of more index blocks maybe needed if the extent tree grow after * the unintialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitilized extent called at this time will be split + * the IO. The uninitialized extent called at this time will be split * into three uninitialized extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). @@ -2954,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; - struct ext4_extent_header *eh; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; @@ -2971,7 +2969,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); @@ -3058,7 +3055,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, err = PTR_ERR(path); goto out; } - eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex2 != &newex) ex2 = ex; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5313ae4..ee92b66 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); size_t length = iov_length(iov, nr_segs); - if (pos > sbi->s_bitmap_maxbytes) + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) return -EFBIG; if (pos + length > sbi->s_bitmap_maxbytes) { @@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (!IS_ERR(cp)) { memcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); } } return dquot_file_open(inode, filp); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25c4b31..ac37750 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -279,7 +279,7 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); } else ext4_error(sb, "bit already cleared for inode %lu", ino); @@ -965,7 +965,7 @@ got: percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 42272d6..a0ab375 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -221,6 +221,7 @@ void ext4_delete_inode(struct inode *inode) "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); goto no_delete; } } @@ -337,9 +338,11 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, struct inode *inode, +static int __ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) { + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; __le32 *bref = p; unsigned int blk; @@ -348,8 +351,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error_inode(function, inode, - "invalid block reference %u", blk); + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); return -EIO; } } @@ -358,11 +362,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, #define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /** @@ -1128,20 +1134,24 @@ void ext4_da_update_reserve_space(struct inode *inode, ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, const char *func, +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, struct ext4_map_blocks *map) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, map->m_len)) { - ext4_error_inode(func, inode, - "lblock %lu mapped to illegal pblock %llu " - "(length %d)", (unsigned long) map->m_lblk, - map->m_pblk, map->m_len); + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); return -EIO; } return 0; } +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + /* * Return the number of contiguous dirty pages in a given inode * starting at page frame idx. @@ -1244,7 +1254,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, __func__, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -1324,9 +1334,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, - "ext4_map_blocks_after_alloc", - map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -1519,9 +1527,25 @@ static int walk_page_buffers(handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext4_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; } /* @@ -2194,7 +2218,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) BUG_ON(!handle); /* - * Call ext4_get_blocks() to allocate any delayed allocation + * Call ext4_map_blocks() to allocate any delayed allocation * blocks, or to convert an uninitialized extent to be * initialized (in the case where we have written into * one or more preallocated blocks). @@ -2203,7 +2227,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * indicate that we are on the delayed allocation path. This * affects functions in many different parts of the allocation * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_get_blocks() + * want to change *many* call functions, so ext4_map_blocks() * will set the magic i_delalloc_reserved_flag once the * inode's allocation semaphore is taken. * @@ -2221,6 +2245,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { + struct super_block *sb = mpd->inode->i_sb; + err = blks; /* * If get block returns with error we simply @@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(sb)) { mpd->retval = err; return 0; } @@ -2243,16 +2269,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * writepage and writepages will again try to write * the same. */ - ext4_msg(mpd->inode->i_sb, KERN_CRIT, - "delayed block allocation failed for inode %lu at " - "logical offset %llu with max blocks %zd with " - "error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - printk(KERN_CRIT "This should not happen!! " - "Data will be lost\n"); - if (err == -ENOSPC) { - ext4_print_free_blocks(mpd->inode); + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_msg(sb, KERN_CRIT, + "delayed block allocation failed for inode %lu " + "at logical offset %llu with max blocks %zd " + "with error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(mpd->inode); } /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, @@ -2320,7 +2347,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * XXX Don't go larger than mballoc is willing to allocate * This is a stopgap solution. We eventually need to fold * mpage_da_submit_io() into this function and then call - * ext4_get_blocks() multiple times in a loop + * ext4_map_blocks() multiple times in a loop */ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) goto flush_it; @@ -2553,18 +2580,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write(), nobh_writepage(), and - * block_write_full_page(). These functions should only try to map a - * single block at a time. + * callback function for block_prepare_write() and block_write_full_page(). + * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller * requests it by passing in create=1, it is critically important that * any caller checks to make sure that any buffer heads are returned * by this function are either all already mapped or marked for - * delayed allocation before calling nobh_writepage() or - * block_write_full_page(). Otherwise, b_blocknr could be left - * unitialized, and the page write functions will be taken by - * surprise. + * delayed allocation before calling block_write_full_page(). Otherwise, + * b_blocknr could be left unitialized, and the page write functions will + * be taken by surprise. */ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2749,9 +2774,7 @@ static int ext4_writepage(struct page *page, return __ext4_journalled_writepage(page, len); } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else if (page_bufs && buffer_uninit(page_bufs)) { + if (page_bufs && buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); ret = block_write_full_page_endio(page, noalloc_get_block_write, wbc, ext4_end_io_buffer_write); @@ -3146,13 +3169,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, int ret, retries = 0; struct page *page; pgoff_t index; - unsigned from, to; struct inode *inode = mapping->host; handle_t *handle; index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; if (ext4_nonda_switch(inode->i_sb)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; @@ -3668,6 +3688,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) return ret; } + if (io->iocb) + aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ io->flag = 0; return ret; @@ -3767,6 +3789,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) io->offset = 0; io->size = 0; io->page = NULL; + io->iocb = NULL; + io->result = 0; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3775,7 +3799,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3784,7 +3809,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3795,12 +3820,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); return; } io_end->offset = offset; io_end->size = size; - io_end->flag = EXT4_IO_UNWRITTEN; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ @@ -3937,7 +3968,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, return -ENOMEM; /* * we save the io structure for current async - * direct IO, so that later ext4_get_blocks() + * direct IO, so that later ext4_map_blocks() * could flag the io structure whether there * is a unwritten extents needs to be converted * when IO is completed. @@ -4128,17 +4159,6 @@ int ext4_block_truncate_page(handle_t *handle, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext4_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -4488,9 +4508,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - EXT4_ERROR_INODE(inode, - "Read failure block=%llu", - (unsigned long long) nr); + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); continue; } @@ -4502,27 +4521,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * jbd2_journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then jbd2_journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext4_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -4546,8 +4544,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ ext4_free_blocks(handle, inode, 0, nr, 1, - EXT4_FREE_BLOCKS_METADATA); + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* @@ -4805,8 +4815,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - EXT4_ERROR_INODE(inode, "unable to read inode block - " - "block %llu", block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4904,8 +4914,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - EXT4_ERROR_INODE(inode, "unable to read inode " - "block %llu", block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); brelse(bh); return -EIO; } @@ -4976,7 +4986,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); - if (ei->i_flags & EXT4_HUGE_FILE_FL) { + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { @@ -5072,7 +5082,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else @@ -5081,7 +5091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) tid = transaction->t_tid; else tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } @@ -5126,7 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ei->i_flags & EXT4_EXTENTS_FL) { + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) @@ -5406,9 +5416,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - EXT4_ERROR_INODE(inode, - "IO error syncing inode (block=%llu)", - (unsigned long long) iloc.bh->b_blocknr); + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); @@ -5483,10 +5492,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (attr->ia_size > sbi->s_bitmap_maxbytes) { - error = -EFBIG; - goto err_out; - } + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; } } @@ -5688,7 +5695,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. @@ -5754,7 +5761,6 @@ static int ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; @@ -5762,7 +5768,6 @@ static int ext4_expand_extra_isize(struct inode *inode, raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 12b3bc0..4b4ad4b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += first + i; ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, - first + i, e4b->bd_group); + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -712,9 +713,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_grp_locked_error(sb, group, __func__, - "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", - group, free, grp->bb_free); + ext4_grp_locked_error(sb, group, 0, 0, + "%u blocks in bitmap, %u in gd", + free, grp->bb_free); /* * If we intent to continue, we consider group descritor * corrupt and update bb_free using bitmap value @@ -1296,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += block; ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, block, - e4b->bd_group); + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1788,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " "group info. But bitmap says 0", free); break; @@ -1798,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -1821,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size requests - * XXX should do so at least for multiples of stripe size as well + * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, @@ -1999,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ext4_group_t ngroups, group, i; int cr; int err = 0; - int bsbits; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2041,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_2order = i - 1; } - bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ @@ -2094,8 +2091,8 @@ repeat: ac->ac_groups_scanned++; if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && - ac->ac_g_ex.fe_len == sbi->s_stripe) + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); @@ -2221,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; m->private = sb; } return rc; @@ -2560,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb) return 0; } +static inline void ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t block, int count) +{ + int ret; + ext4_fsblk_t discard_block; + + discard_block = block + ext4_group_first_block_no(sb, block_group); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + ret = sb_issue_discard(sb, discard_block, count); + if (ret == EOPNOTSUPP) { + ext4_warning(sb, "discard not supported, disabling"); + clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); + } +} + /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. @@ -2579,22 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) { - int ret; - ext4_fsblk_t discard_block; - - discard_block = entry->start_blk + - ext4_group_first_block_no(sb, entry->group); - trace_ext4_discard_blocks(sb, - (unsigned long long)discard_block, - entry->count); - ret = sb_issue_discard(sb, discard_block, entry->count); - if (ret == EOPNOTSUPP) { - ext4_warning(sb, - "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); - } - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2704,7 +2704,7 @@ void exit_ext4_mballoc(void) /* - * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int @@ -2712,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_blks) { struct buffer_head *bitmap_bh = NULL; - struct ext4_super_block *es; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; @@ -2725,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - es = sbi->s_es; - err = -EIO; bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); @@ -2812,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); brelse(bitmap_bh); return err; } @@ -2850,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; - ext4_lblk_t start, orig_start; + ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -2881,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); + orig_size = size; /* max size of free chunks */ max = 2 << bsbits; @@ -2922,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; size = ac->ac_o_ex.fe_len << bsbits; } - orig_size = size = size >> bsbits; - orig_start = start = start_off >> bsbits; + size = size >> bsbits; + start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { @@ -3547,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; - sector_t start; int err = 0; int free = 0; @@ -3567,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = ext4_group_first_block_no(sb, group) + bit; mb_debug(1, " free preallocated %u/%u in group %u\n", - (unsigned) start, (unsigned) next - bit, - (unsigned) group); + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); free += next - bit; if (ac) { @@ -3581,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, trace_ext4_mballoc_discard(ac); } - trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit, next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; @@ -3591,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_grp_locked_error(sb, group, - __func__, "free %u, pa_free %u", + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained @@ -3613,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(ac, pa); + trace_ext4_mb_release_group_pa(sb, ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3889,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return; + printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); printk(KERN_ERR "EXT4-fs: status %d flags %d\n", @@ -4255,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) + struct ext4_allocation_request *ar, int *errp) { int freed; struct ext4_allocation_context *ac = NULL; @@ -4299,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; - goto out3; + goto out; } } @@ -4307,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (!ac) { ar->len = 0; *errp = -ENOMEM; - goto out1; + goto out; } *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out2; + goto out; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; @@ -4322,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_mb_normalize_request(ac, ar); repeat: /* allocate space in core */ - ext4_mb_regular_allocator(ac); + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto errout; /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4333,7 +4333,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); - if (*errp == -EAGAIN) { + if (*errp == -EAGAIN) { /* * drop the reference that we took * in ext4_mb_use_best_found @@ -4344,12 +4344,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) { + } else if (*errp) + errout: ext4_discard_allocated_blocks(ac); - ac->ac_b_ex.fe_len = 0; - ar->len = 0; - ext4_mb_show_ac(ac); - } else { + else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4358,19 +4356,19 @@ repeat: if (freed) goto repeat; *errp = -ENOSPC; + } + + if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } - ext4_mb_release_context(ac); - -out2: - kmem_cache_free(ext4_ac_cachep, ac); -out1: +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); -out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) /* release all the reserved blocks if non delalloc */ @@ -4402,6 +4400,7 @@ static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { + ext4_group_t group = e4b->bd_group; ext4_grpblk_t block; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; @@ -4434,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_grp_locked_error(sb, e4b->bd_group, __func__, - "Double free of blocks %d (%d %d)", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + block, + "Block already on to-be-freed list"); return 0; } } @@ -4494,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; - struct ext4_super_block *es; unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; @@ -4513,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " @@ -4647,6 +4644,8 @@ do_more: mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4680,7 +4679,7 @@ do_more: put_bh(bitmap_bh); goto do_more; } - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); error_return: if (freed) dquot_free_block(inode, freed); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6f3a27e..1765c2c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ei->i_flags |= EXT4_EXTENTS_FL; + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 52abfa1..5f1ed9f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, */ static int mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function) + const char *function, unsigned int line) { int ret = 0; if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, + __ext4_error(inode2->i_sb, function, line, "Both inodes should not be NULL: " "inode1 NULL inode2 %lu", inode2->i_ino); ret = -EIO; } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, + __ext4_error(inode1->i_sb, function, line, "Both inodes should not be NULL: " "inode1 %lu inode2 NULL", inode1->i_ino); ret = -EIO; @@ -1084,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) BUG_ON(inode1 == NULL && inode2 == NULL); - ret = mext_check_null_inode(inode1, inode2, __func__); + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); if (ret < 0) goto out; @@ -1121,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) BUG_ON(inode1 == NULL && inode2 == NULL); - ret = mext_check_null_inode(inode1, inode2, __func__); + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); if (ret < 0) goto out; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a43e661..314c0d3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); -unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return blocksize; - return (len & 65532) | ((len & 3) << 16); -} - -__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) -{ - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); - if (len < 65536) - return cpu_to_le16(len); - if (len == blocksize) { - if (blocksize == 65536) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else - return cpu_to_le16(0); - } - return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -} - /* * p is at least 6 bytes before the end of page */ @@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + if (!ext4_check_dir_entry(dir, de, bh, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ @@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, offset)) + if (!ext4_check_dir_entry(dir, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); - if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + if (!ext4_check_dir_entry(dir, de, bh, off)) { brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; @@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - struct inode *inode; static const struct qstr dotdot = { .name = "..", .len = 2, @@ -1097,7 +1071,6 @@ struct dentry *ext4_get_parent(struct dentry *child) struct buffer_head *bh; bh = ext4_find_entry(child->d_inode, &dotdot, &de); - inode = NULL; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) + if (!ext4_check_dir_entry(dir, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) + if (!ext4_check_dir_entry(dir, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -1956,7 +1928,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { + if (!ext4_check_dir_entry(inode, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6df797e..ca5c8aa 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -921,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) &sbi->s_flex_groups[flex_group].free_inodes); } - ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - sb->s_dirt = 1; + ext4_handle_dirty_super(handle, sb); exit_journal: mutex_unlock(&sbi->s_resize_lock); @@ -953,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count) { ext4_fsblk_t o_blocks_count; - ext4_group_t o_groups_count; ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; @@ -965,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, * yet: we're going to revalidate es->s_blocks_count after * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); - o_groups_count = EXT4_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", @@ -1045,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); - sb->s_dirt = 1; mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e8983a..8d65575 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,14 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; if (journal) { if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, "Detected aborted journal"); + ext4_abort(sb, "Detected aborted journal"); return ERR_PTR(-EROFS); } return jbd2_journal_start(journal, nblocks); @@ -262,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * that sync() will call the filesystem's write_super callback if * appropriate. */ -int __ext4_journal_stop(const char *where, handle_t *handle) +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; @@ -279,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle) if (!err) err = rc; if (err) - __ext4_std_error(sb, where, err); + __ext4_std_error(sb, where, line, err); return err; } -void ext4_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err) +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); @@ -300,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); +} + +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -323,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, static void ext4_handle_error(struct super_block *sb) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - if (sb->s_flags & MS_RDONLY) return; @@ -342,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb) ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } void __ext4_error(struct super_block *sb, const char *function, - const char *fmt, ...) + unsigned int line, const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", + sb->s_id, function, line, current->comm); vprintk(fmt, args); printk("\n"); va_end(args); @@ -362,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } -void ext4_error_inode(const char *function, struct inode *inode, +void ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + save_error_info(inode->i_sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", - inode->i_sb->s_id, function, inode->i_ino, current->comm); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk("block %llu: ", block); + printk("comm %s: ", current->comm); vprintk(fmt, args); printk("\n"); va_end(args); @@ -377,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode, ext4_handle_error(inode->i_sb); } -void ext4_error_file(const char *function, struct file *file, - const char *fmt, ...) +void ext4_error_file(struct file *file, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; + struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + save_error_info(inode->i_sb, function, line); va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (!path) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", - inode->i_sb->s_id, function, inode->i_ino, current->comm, path); + "EXT4-fs error (device %s): %s:%d: inode #%lu " + "(comm %s path %s): ", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path); vprintk(fmt, args); printk("\n"); va_end(args); @@ -435,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ -void __ext4_std_error(struct super_block *sb, const char *function, int errno) +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) { char nbuf[16]; const char *errstr; @@ -448,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno) return; errstr = ext4_decode_error(sb, errno, nbuf); - printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -464,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno) * case we take the easy way out and panic immediately. */ -void ext4_abort(struct super_block *sb, const char *function, - const char *fmt, ...) +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; + save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); vprintk(fmt, args); printk("\n"); va_end(args); + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs panic from previous error\n"); - - if (sb->s_flags & MS_RDONLY) - return; - - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } void ext4_msg (struct super_block * sb, const char *prefix, @@ -502,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix, } void __ext4_warning(struct super_block *sb, const char *function, - const char *fmt, ...) + unsigned int line, const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", - sb->s_id, function); + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", + sb->s_id, function, line); vprintk(fmt, args); printk("\n"); va_end(args); } -void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, - const char *function, const char *fmt, ...) +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", + sb->s_id, function, line, grp); + if (ino) + printk("inode %lu: ", ino); + if (block) + printk("block %llu:", (unsigned long long) block); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_CONT)) { - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_commit_super(sb, 0); return; } + ext4_unlock_group(sb, grp); ext4_handle_error(sb); /* @@ -660,8 +716,7 @@ static void ext4_put_super(struct super_block *sb) err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if (err < 0) - ext4_abort(sb, __func__, - "Couldn't clean up the journal"); + ext4_abort(sb, "Couldn't clean up the journal"); } ext4_release_system_zone(sb); @@ -946,14 +1001,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); else if (test_opt(sb, JOURNAL_CHECKSUM)) seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC)) + if (!test_opt(sb, DELALLOC) && + !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); - if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -977,7 +1030,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); - if (test_opt(sb, DISCARD)) + if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sb, NOLOAD)) @@ -986,6 +1039,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DIOREAD_NOLOCK)) seq_puts(seq, ",dioread_nolock"); + if (test_opt(sb, BLOCK_VALIDITY) && + !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) + seq_puts(seq, ",block_validity"); + ext4_show_quota_options(seq, sb); return 0; @@ -1065,6 +1122,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1086,7 +1144,7 @@ static const struct dquot_operations ext4_quota_operations = { static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, - .quota_off = dquot_quota_off, + .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_info = dquot_get_dqinfo, .set_info = dquot_set_dqinfo, @@ -1624,10 +1682,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated bh option"); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); @@ -2249,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a, { struct super_block *sb = sbi->s_buddy_cache->i_sb; + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - sbi->s_sectors_written_start) >> 1); @@ -2259,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, { struct super_block *sb = sbi->s_buddy_cache->i_sb; + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - @@ -2431,6 +2495,53 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 1; } +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + ext4_msg(sb, KERN_NOTICE, "error count: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2448,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; char *cp; const char *descr; - int ret = -EINVAL; + int ret = -ENOMEM; int blocksize; unsigned int db_count; unsigned int i; @@ -2459,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; + goto out_free_orig; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) { kfree(sbi); - return -ENOMEM; + goto out_free_orig; } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; @@ -2473,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, - sectors[1]); + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); unlock_kernel(); @@ -2482,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); @@ -2546,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, ERRORS_CONT); else set_opt(sbi->s_mount_opt, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sbi->s_mount_opt, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -2553,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - set_opt(sbi->s_mount_opt, BARRIER); + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sbi->s_mount_opt, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb)) + if (!IS_EXT3_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sbi->s_mount_opt, DELALLOC); + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, NULL, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, NULL, 0)) goto failed_mount; @@ -2912,18 +3037,7 @@ no_journal: ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount_wq; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " - "its supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " - "not supported with nobh mode"); - goto failed_mount_wq; - } - } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3010,7 +3124,7 @@ no_journal: ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { - ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount4; } @@ -3043,7 +3157,14 @@ no_journal: descr = "out journal"; ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s", descr, orig_data); + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ lock_kernel(); kfree(orig_data); @@ -3093,6 +3214,7 @@ out_fail: kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); +out_free_orig: kfree(orig_data); return ret; } @@ -3110,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else @@ -3119,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } static journal_t *ext4_get_journal(struct super_block *sb, @@ -3327,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb, if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); - if (!err) + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); @@ -3384,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & MS_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( @@ -3491,7 +3626,7 @@ int ext4_force_commit(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; if (journal) { - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); ret = ext4_journal_force_commit(journal); } @@ -3616,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, __func__, "Abort forced by user"); + ext4_abort(sb, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3981,6 +4116,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static int ext4_quota_off(struct super_block *sb, int type) +{ + /* Force all delayed allocation blocks to be allocated */ + if (test_opt(sb, DELALLOC)) { + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + return dquot_quota_off(sb, type); +} + /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and noone else should touch the files) @@ -4030,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -4055,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_page(bh->b_page); unlock_buffer(bh); - if (journal_quota) - err = ext4_handle_dirty_metadata(handle, NULL, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); - mark_buffer_dirty(bh); - } + err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (err) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 0433800..a6f3142 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_super(handle, sb); } } @@ -178,7 +178,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->open_fds = (fd_set *)data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = (fd_set *)data; - INIT_RCU_HEAD(&fdt->rcu); fdt->next = NULL; return fdt; @@ -312,7 +311,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; new_fdt->open_fds = (fd_set *)&newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&new_fdt->rcu); new_fdt->next = NULL; spin_lock(&oldf->file_lock); @@ -430,7 +428,6 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = (fd_set *)&init_files.close_on_exec_init, .open_fds = (fd_set *)&init_files.open_fds_init, - .rcu = RCU_HEAD_INIT, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 1e8af93..5132c99 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -135,7 +135,7 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data) } /** - * vxfs_read_super - read superblock into memory and initalize filesystem + * vxfs_read_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0609607..d5be169 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -38,43 +38,18 @@ int nr_pdflush_threads; /* * Passed into wb_writeback(), essentially a subset of writeback_control */ -struct wb_writeback_args { +struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; -}; -/* - * Work items for the bdi_writeback threads - */ -struct bdi_work { struct list_head list; /* pending work list */ - struct rcu_head rcu_head; /* for RCU free/clear of work */ - - unsigned long seen; /* threads that have seen this work */ - atomic_t pending; /* number of threads still to do work */ - - struct wb_writeback_args args; /* writeback arguments */ - - unsigned long state; /* flag bits, see WS_* */ + struct completion *done; /* set if the caller waits */ }; -enum { - WS_INPROGRESS = 0, - WS_ONSTACK, -}; - -static inline void bdi_work_init(struct bdi_work *work, - struct wb_writeback_args *args) -{ - INIT_RCU_HEAD(&work->rcu_head); - work->args = *args; - __set_bit(WS_INPROGRESS, &work->state); -} - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -87,49 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) return !list_empty(&bdi->work_list); } -static void bdi_work_free(struct rcu_head *head) -{ - struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - - clear_bit(WS_INPROGRESS, &work->state); - smp_mb__after_clear_bit(); - wake_up_bit(&work->state, WS_INPROGRESS); - - if (!test_bit(WS_ONSTACK, &work->state)) - kfree(work); -} - -static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) +static void bdi_queue_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) { - /* - * The caller has retrieved the work arguments from this work, - * drop our reference. If this is the last ref, delete and free it - */ - if (atomic_dec_and_test(&work->pending)) { - struct backing_dev_info *bdi = wb->bdi; - - spin_lock(&bdi->wb_lock); - list_del_rcu(&work->list); - spin_unlock(&bdi->wb_lock); - - call_rcu(&work->rcu_head, bdi_work_free); - } -} - -static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) -{ - work->seen = bdi->wb_mask; - BUG_ON(!work->seen); - atomic_set(&work->pending, bdi->wb_cnt); - BUG_ON(!bdi->wb_cnt); - - /* - * list_add_tail_rcu() contains the necessary barriers to - * make sure the above stores are seen before the item is - * noticed on the list - */ spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&work->list, &bdi->work_list); + list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); /* @@ -146,55 +83,29 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) } } -/* - * Used for on-stack allocated work items. The caller needs to wait until - * the wb threads have acked the work before it's safe to continue. - */ -static void bdi_wait_on_work_done(struct bdi_work *work) +static void +__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + bool range_cyclic, bool for_background) { - wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); -} - -static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args) -{ - struct bdi_work *work; + struct wb_writeback_work *work; /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - bdi_work_init(work, args); - bdi_queue_work(bdi, work); - } else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + if (bdi->wb.task) + wake_up_process(bdi->wb.task); + return; } -} -/** - * bdi_queue_work_onstack - start and wait for writeback - * @sb: write inodes from this super_block - * - * Description: - * This function initiates writeback and waits for the operation to - * complete. Callers must hold the sb s_umount semaphore for - * reading, to avoid having the super disappear before we are done. - */ -static void bdi_queue_work_onstack(struct wb_writeback_args *args) -{ - struct bdi_work work; - - bdi_work_init(&work, args); - __set_bit(WS_ONSTACK, &work.state); + work->sync_mode = WB_SYNC_NONE; + work->nr_pages = nr_pages; + work->range_cyclic = range_cyclic; + work->for_background = for_background; - bdi_queue_work(args->sb->s_bdi, &work); - bdi_wait_on_work_done(&work); + bdi_queue_work(bdi, work); } /** @@ -210,13 +121,7 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args) */ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) { - struct wb_writeback_args args = { - .sync_mode = WB_SYNC_NONE, - .nr_pages = nr_pages, - .range_cyclic = 1, - }; - - bdi_alloc_queue_work(bdi, &args); + __bdi_start_writeback(bdi, nr_pages, true, false); } /** @@ -230,13 +135,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) */ void bdi_start_background_writeback(struct backing_dev_info *bdi) { - struct wb_writeback_args args = { - .sync_mode = WB_SYNC_NONE, - .nr_pages = LONG_MAX, - .for_background = 1, - .range_cyclic = 1, - }; - bdi_alloc_queue_work(bdi, &args); + __bdi_start_writeback(bdi, LONG_MAX, true, true); } /* @@ -554,29 +453,41 @@ static bool pin_sb_for_writeback(struct super_block *sb) /* * Write a portion of b_io inodes which belong to @sb. - * If @wbc->sb != NULL, then find and write all such + * + * If @only_this_sb is true, then find and write all such * inodes. Otherwise write only ones which go sequentially * in reverse order. + * * Return 1, if the caller writeback routine should be * interrupted. Otherwise return 0. */ -static int writeback_sb_inodes(struct super_block *sb, - struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, + struct writeback_control *wbc, bool only_this_sb) { while (!list_empty(&wb->b_io)) { long pages_skipped; struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - if (wbc->sb && sb != inode->i_sb) { - /* super block given and doesn't - match, skip this inode */ - redirty_tail(inode); - continue; - } - if (sb != inode->i_sb) - /* finish with this superblock */ + + if (inode->i_sb != sb) { + if (only_this_sb) { + /* + * We only want to write back data for this + * superblock, move all inodes not belonging + * to it back onto the dirty list. + */ + redirty_tail(inode); + continue; + } + + /* + * The inode belongs to a different superblock. + * Bounce back to the caller to unpin this and + * pin the next superblock. + */ return 0; + } + if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; @@ -614,8 +525,8 @@ static int writeback_sb_inodes(struct super_block *sb, return 1; } -static void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) { int ret = 0; @@ -629,29 +540,12 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, struct inode, i_list); struct super_block *sb = inode->i_sb; - if (wbc->sb) { - /* - * We are requested to write out inodes for a specific - * superblock. This means we already have s_umount - * taken by the caller which also waits for us to - * complete the writeout. - */ - if (sb != wbc->sb) { - redirty_tail(inode); - continue; - } - - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - - ret = writeback_sb_inodes(sb, wb, wbc); - } else { - if (!pin_sb_for_writeback(sb)) { - requeue_io(inode); - continue; - } - ret = writeback_sb_inodes(sb, wb, wbc); - drop_super(sb); + if (!pin_sb_for_writeback(sb)) { + requeue_io(inode); + continue; } + ret = writeback_sb_inodes(sb, wb, wbc, false); + drop_super(sb); if (ret) break; @@ -660,11 +554,17 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, /* Leave any unwritten inodes on b_io */ } -void writeback_inodes_wbc(struct writeback_control *wbc) +static void __writeback_inodes_sb(struct super_block *sb, + struct bdi_writeback *wb, struct writeback_control *wbc) { - struct backing_dev_info *bdi = wbc->bdi; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); - writeback_inodes_wb(&bdi->wb, wbc); + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + writeback_sb_inodes(sb, wb, wbc, true); + spin_unlock(&inode_lock); } /* @@ -702,16 +602,14 @@ static inline bool over_bground_thresh(void) * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, - struct wb_writeback_args *args) + struct wb_writeback_work *work) { struct writeback_control wbc = { - .bdi = wb->bdi, - .sb = args->sb, - .sync_mode = args->sync_mode, + .sync_mode = work->sync_mode, .older_than_this = NULL, - .for_kupdate = args->for_kupdate, - .for_background = args->for_background, - .range_cyclic = args->range_cyclic, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, }; unsigned long oldest_jif; long wrote = 0; @@ -731,21 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Stop writeback when nr_pages has been consumed */ - if (args->nr_pages <= 0) + if (work->nr_pages <= 0) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (args->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh()) break; wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; - writeback_inodes_wb(wb, &wbc); - args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + if (work->sb) + __writeback_inodes_sb(work->sb, wb, &wbc); + else + writeback_inodes_wb(wb, &wbc); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; /* @@ -781,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb, } /* - * Return the next bdi_work struct that hasn't been processed by this - * wb thread yet. ->seen is initially set for each thread that exists - * for this device, when a thread first notices a piece of work it - * clears its bit. Depending on writeback type, the thread will notify - * completion on either receiving the work (WB_SYNC_NONE) or after - * it is done (WB_SYNC_ALL). + * Return the next wb_writeback_work struct that hasn't been processed yet. */ -static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, - struct bdi_writeback *wb) +static struct wb_writeback_work * +get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) { - struct bdi_work *work, *ret = NULL; - - rcu_read_lock(); + struct wb_writeback_work *work = NULL; - list_for_each_entry_rcu(work, &bdi->work_list, list) { - if (!test_bit(wb->nr, &work->seen)) - continue; - clear_bit(wb->nr, &work->seen); - - ret = work; - break; + spin_lock(&bdi->wb_lock); + if (!list_empty(&bdi->work_list)) { + work = list_entry(bdi->work_list.next, + struct wb_writeback_work, list); + list_del_init(&work->list); } - - rcu_read_unlock(); - return ret; + spin_unlock(&bdi->wb_lock); + return work; } static long wb_check_old_data_flush(struct bdi_writeback *wb) @@ -830,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) (inodes_stat.nr_inodes - inodes_stat.nr_unused); if (nr_pages) { - struct wb_writeback_args args = { + struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, }; - return wb_writeback(wb, &args); + return wb_writeback(wb, &work); } return 0; @@ -849,33 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) long wb_do_writeback(struct bdi_writeback *wb, int force_wait) { struct backing_dev_info *bdi = wb->bdi; - struct bdi_work *work; + struct wb_writeback_work *work; long wrote = 0; while ((work = get_next_work_item(bdi, wb)) != NULL) { - struct wb_writeback_args args = work->args; - /* * Override sync mode, in case we must wait for completion + * because this thread is exiting now. */ if (force_wait) - work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; - - /* - * If this isn't a data integrity operation, just notify - * that we have seen this work and we are now starting it. - */ - if (!test_bit(WS_ONSTACK, &work->state)) - wb_clear_pending(wb, work); + work->sync_mode = WB_SYNC_ALL; - wrote += wb_writeback(wb, &args); + wrote += wb_writeback(wb, work); /* - * This is a data integrity writeback, so only do the - * notification when we have completed the work. + * Notify the caller of completion if this is a synchronous + * work item, otherwise just free it. */ - if (test_bit(WS_ONSTACK, &work->state)) - wb_clear_pending(wb, work); + if (work->done) + complete(work->done); + else + kfree(work); } /* @@ -938,14 +823,9 @@ int bdi_writeback_task(struct bdi_writeback *wb) void wakeup_flusher_threads(long nr_pages) { struct backing_dev_info *bdi; - struct wb_writeback_args args = { - .sync_mode = WB_SYNC_NONE, - }; - if (nr_pages) { - args.nr_pages = nr_pages; - } else { - args.nr_pages = global_page_state(NR_FILE_DIRTY) + + if (!nr_pages) { + nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); } @@ -953,7 +833,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); + __bdi_start_writeback(bdi, nr_pages, false, false); } rcu_read_unlock(); } @@ -1162,17 +1042,20 @@ void writeback_inodes_sb(struct super_block *sb) { unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - struct wb_writeback_args args = { + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, + .done = &done, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - args.nr_pages = nr_dirty + nr_unstable + + work.nr_pages = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - bdi_queue_work_onstack(&args); + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1204,16 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); */ void sync_inodes_sb(struct super_block *sb) { - struct wb_writeback_args args = { + DECLARE_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, + .done = &done, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work_onstack(&args); + bdi_queue_work(sb->s_bdi, &work); + wait_for_completion(&done); + wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index cc94bb9..3f6dfa9 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -1,7 +1,6 @@ config FSCACHE tristate "General filesystem local caching manager" - select SLOW_WORK help This option enables a generic filesystem caching manager that can be used by various network and other filesystems to cache data locally. diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index edd7434..6a02644 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index add6bdb..f9d8567 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/completion.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); @@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = "object_max_active", + .data = &fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif /* * initialise the fs caching module */ static int __init fscache_init(void) { + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; int ret; - ret = slow_work_register_user(THIS_MODULE); - if (ret < 0) - goto error_slow_work; + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); ret = fscache_proc_init(); if (ret < 0) goto error_proc; +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, @@ -78,10 +162,16 @@ static int __init fscache_init(void) error_kobj: kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif fscache_proc_cleanup(); error_proc: - slow_work_unregister_user(THIS_MODULE); -error_slow_work: + destroy_workqueue(fscache_op_wq); +error_op_wq: + destroy_workqueue(fscache_object_wq); +error_object_wq: return ret; } @@ -96,8 +186,12 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +#endif fscache_proc_cleanup(); - slow_work_unregister_user(THIS_MODULE); + destroy_workqueue(fscache_op_wq); + destroy_workqueue(fscache_object_wq); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 4a8eb31..ebe29c5 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -34,8 +34,8 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ #define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ u8 buf[512]; /* key and aux data buffer */ }; @@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) READS, NOREADS); FILTER(obj->events & obj->event_mask, EVENTS, NOEVENTS); - FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), - WORK, NOWORK); + FILTER(work_busy(&obj->work), WORK, NOWORK); } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", obj->debug_id, obj->parent ? obj->parent->debug_id : -1, fscache_object_states_short[obj->state], @@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, obj->events, obj->flags, - obj->work.flags); + work_busy(&obj->work)); no_cookie = true; keylen = auxlen = 0; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0b589a9..b6b897c 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,7 +14,6 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> -#include <linux/seq_file.h> #include "internal.h" const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_DEAD] = "DEAD", }; -static void fscache_object_slow_work_put_ref(struct slow_work *); -static int fscache_object_slow_work_get_ref(struct slow_work *); -static void fscache_object_slow_work_execute(struct slow_work *); -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); -#endif +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *); static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); -const struct slow_work_ops fscache_object_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_object_slow_work_get_ref, - .put_ref = fscache_object_slow_work_put_ref, - .execute = fscache_object_slow_work_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_object_slow_work_desc, -#endif -}; -EXPORT_SYMBOL(fscache_object_slow_work_ops); - /* * we need to notify the parent when an op completes that we had outstanding * upon it @@ -345,7 +329,7 @@ unsupported_event: /* * execute an object */ -static void fscache_object_slow_work_execute(struct slow_work *work) +void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work) if (object->events & object->event_mask) fscache_enqueue_object(object); clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); } - -/* - * describe an object for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *work, - struct seq_file *m) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - seq_printf(m, "FSC: OBJ%x: %s", - object->debug_id, - fscache_object_states_short[object->state]); -} -#endif +EXPORT_SYMBOL(fscache_object_work_func); /* * initialise an object @@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object) _enter(""); ASSERT(object->cookie != NULL); ASSERT(object->cookie->parent != NULL); - ASSERT(list_empty(&object->work.link)); if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | (1 << FSCACHE_OBJECT_EV_RELEASE) | @@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object) object->parent = NULL; } - /* this just shifts the object release to the slow work processor */ - fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object); - fscache_stat_d(&fscache_n_cop_put_object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); _leave(""); } @@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache, } /* - * allow the slow work item processor to get a ref on an object + * get a ref on an object */ -static int fscache_object_slow_work_get_ref(struct slow_work *work) +static int fscache_get_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); int ret; fscache_stat(&fscache_n_cop_grab_object); @@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) } /* - * allow the slow work item processor to discard a ref on a work item + * discard a ref on a work item */ -static void fscache_object_slow_work_put_ref(struct slow_work *work) +static void fscache_put_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); - fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); @@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - slow_work_enqueue(&object->work); + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. + */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); } +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* * enqueue the dependents of an object for metadata-type processing @@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); - fscache_stat(&fscache_n_cop_put_object); - dep->cache->ops->put_object(dep); - fscache_stat_d(&fscache_n_cop_put_object); + fscache_put_object(dep); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index f17cecaf..b9f34ea 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_FAST: - _debug("queue fast"); + case FSCACHE_OP_ASYNC: + _debug("queue async"); atomic_inc(&op->usage); - if (!schedule_work(&op->fast_work)) + if (!queue_work(fscache_op_wq, &op->work)) fscache_put_operation(op); break; - case FSCACHE_OP_SLOW: - _debug("queue slow"); - slow_work_enqueue(&op->slow_work); - break; case FSCACHE_OP_MYTHREAD: _debug("queue for caller's attention"); break; @@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work) } /* - * allow the slow work item processor to get a ref on an operation - */ -static int fscache_op_get_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - atomic_inc(&op->usage); - return 0; -} - -/* - * allow the slow work item processor to discard a ref on an operation - */ -static void fscache_op_put_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - fscache_put_operation(op); -} - -/* - * execute an operation using the slow thread pool to provide processing context - * - the caller holds a ref to this object, so we don't need to hold one + * execute an operation using fs_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one */ -static void fscache_op_execute(struct slow_work *work) +void fscache_op_work_func(struct work_struct *work) { struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); + container_of(work, struct fscache_operation, work); unsigned long start; _enter("{OBJ%x OP%x,%d}", @@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work) start = jiffies; op->processor(op); fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); _leave(""); } - -/* - * describe an operation for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_op_desc(struct slow_work *work, struct seq_file *m) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx", - op->object->debug_id, op->debug_id, - op->name, op->state, op->flags); -} -#endif - -const struct slow_work_ops fscache_op_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_op_get_ref, - .put_ref = fscache_op_put_ref, - .execute = fscache_op_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_op_desc, -#endif -}; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 723b889..41c441c 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, page_busy: /* we might want to wait here, but that could deadlock the allocator as - * the slow-work threads writing to the cache may all end up sleeping + * the work threads writing to the cache may all end up sleeping * on memory allocation */ fscache_stat(&fscache_n_store_vmscan_busy); return false; @@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, NULL); - fscache_operation_init_slow(op, fscache_attr_changed_op); - op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -218,24 +217,6 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* - * handle secondary execution given to a retrieval op on behalf of the - * cache - */ -static void fscache_retrieval_work(struct work_struct *work) -{ - struct fscache_retrieval *op = - container_of(work, struct fscache_retrieval, op.fast_work); - unsigned long start; - - _enter("{OP%x}", op->op.debug_id); - - start = jiffies; - op->op.processor(&op->op); - fscache_hist(fscache_ops_histogram, start); - fscache_put_operation(&op->op); -} - -/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; - INIT_WORK(&op->op.fast_work, fscache_retrieval_work); INIT_LIST_HEAD(&op->to_do); fscache_set_op_name(&op->op, "Retr"); return op; @@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_release_write_op); - fscache_operation_init_slow(&op->op, fscache_write_op); - op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); @@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_store_ops); fscache_stat(&fscache_n_stores_ok); - /* the slow work queue now carries its own ref on the object */ + /* the work queue now carries its own ref on the object */ fscache_put_operation(&op->op); _leave(" = 0"); return 0; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9424796..69ad053 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc) static void queue_request(struct fuse_conn *fc, struct fuse_req *req) { - req->in.h.unique = fuse_get_unique(fc); req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); @@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); } } @@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) else if (fc->conn_error) req->out.h.error = -ECONNREFUSED; else { + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); /* acquire extra reference, since request is still needed after request_end() */ @@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send_background); +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique = unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + /* * Called under fc->lock * @@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (!cs->write) { buf->ops->unmap(cs->pipe, buf, cs->mapaddr); } else { - kunmap_atomic(cs->mapaddr, KM_USER0); + kunmap(buf->page); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr, KM_USER0); + kunmap(cs->pg); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page, KM_USER0); + cs->mapaddr = kmap(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); + cs->mapaddr = kmap(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; @@ -1231,6 +1249,199 @@ err: return err; } +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size - sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + page_cache_release(page); + } +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + num -= this_num; + total_len += this_num; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1244,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INVAL_ENTRY: return fuse_notify_inval_entry(fc, size, cs); + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3cdc5f7..431be07 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1016,7 +1016,7 @@ static int fuse_permission(struct inode *inode, int mask) exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ - } else if (mask & MAY_ACCESS) { + } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ada0ade..147c1f7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; } -static void fuse_write_update_size(struct inode *inode, loff_t pos) +void fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f309f0..57d4a3a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,7 @@ struct fuse_req { struct fuse_write_in in; struct fuse_write_out out; } write; + struct fuse_notify_retrieve_in retrieve_in; struct fuse_lk_in lk_in; } misc; @@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); +void fuse_write_update_size(struct inode *inode, loff_t pos); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index a47b431..cc96655 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -7,7 +7,6 @@ config GFS2_FS select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 - select SLOW_WORK select QUOTACTL help A cluster filesystem. diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9f8b525..5e96cbd 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -136,10 +136,7 @@ static int gfs2_writeback_writepage(struct page *page, if (ret <= 0) return ret; - ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); - if (ret == -EAGAIN) - ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); - return ret; + return nobh_writepage(page, gfs2_get_block_noalloc, wbc); } /** @@ -637,9 +634,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, } } - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); - if (error) - goto out_unlock; + alloc_required = gfs2_write_alloc_required(ip, pos, len); if (alloc_required || gfs2_is_jdata(ip)) gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 4a48c0f..6f48280 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1040,7 +1040,8 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - u64 dsize = size + sizeof(struct gfs2_inode); + u64 dsize = size + sizeof(struct gfs2_dinode); + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1243,13 +1244,12 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) * @ip: the file being written to * @offset: the offset to write to * @len: the number of bytes being written - * @alloc_required: set to 1 if an alloc is required, 0 otherwise * - * Returns: errno + * Returns: 1 if an alloc is required, 0 otherwise */ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required) + unsigned int len) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head bh; @@ -1257,26 +1257,23 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, u64 lblock, lblock_stop, size; u64 end_of_file; - *alloc_required = 0; - if (!len) return 0; if (gfs2_is_stuffed(ip)) { if (offset + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) - *alloc_required = 1; + return 1; return 0; } - *alloc_required = 1; shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file) - return 0; + return 1; size = (lblock_stop - lblock) << shift; do { @@ -1284,12 +1281,11 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, bh.b_size = size; gfs2_block_map(&ip->i_inode, lblock, &bh, 0); if (!buffer_mapped(&bh)) - return 0; + return 1; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - *alloc_required = 0; return 0; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index c983177..a20a521 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -52,6 +52,6 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size); int gfs2_truncatei_resume(struct gfs2_inode *ip); int gfs2_file_dealloc(struct gfs2_inode *ip); int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len, int *alloc_required); + unsigned int len); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 8295c5b..b9dd88a 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, unsigned totlen = be16_to_cpu(dent->de_rec_len); if (gfs2_dirent_sentinel(dent)) - actual = GFS2_DIRENT_SIZE(0); + actual = 0; if (totlen - actual >= required) return 1; return 0; @@ -955,7 +955,12 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) /* Change the pointers. Don't bother distinguishing stuffed from non-stuffed. This code is complicated enough already. */ - lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS | __GFP_NOFAIL); + lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + /* Change the pointers */ for (x = 0; x < half_len; x++) lp[x] = cpu_to_be64(bn); @@ -1063,7 +1068,9 @@ static int dir_double_exhash(struct gfs2_inode *dip) /* Allocate both the "from" and "to" buffers in one big chunk */ - buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS); + if (!buf) + return -ENOMEM; for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, @@ -1231,6 +1238,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, return 0; } +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + return ptr; +} + +static void gfs2_free_sort_buffer(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir, int *copied, unsigned *depth, u64 leaf_no) @@ -1271,7 +1297,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, * 99 is the maximum number of entries that can fit in a single * leaf block. */ - larr = vmalloc((leaves + entries + 99) * sizeof(void *)); + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; darr = (const struct gfs2_dirent **)(larr + leaves); @@ -1282,7 +1308,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, do { error = get_leaf(ip, lfn, &bh); if (error) - goto out_kfree; + goto out_free; lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { @@ -1291,7 +1317,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, gfs2_dirent_gather, NULL, &g); error = PTR_ERR(dent); if (IS_ERR(dent)) - goto out_kfree; + goto out_free; if (entries2 != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir " "leaf %llu, entries2 (%u) != " @@ -1300,7 +1326,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, entries2, g.offset); error = -EIO; - goto out_kfree; + goto out_free; } error = 0; larr[leaf++] = bh; @@ -1312,10 +1338,10 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, BUG_ON(entries2 != entries); error = do_filldir_main(ip, offset, opaque, filldir, darr, entries, copied); -out_kfree: +out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - vfree(larr); + gfs2_free_sort_buffer(larr); out: return error; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ed9a94f..4edd662 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -351,7 +351,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; - int alloc_required = 0; struct gfs2_holder gh; struct gfs2_alloc *al; int ret; @@ -364,8 +363,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); - if (ret || !alloc_required) + if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) goto out_unlock; ret = -ENOMEM; al = gfs2_alloc_get(ip); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ddcdbf4..9adf8f9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -328,6 +328,30 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) } /** + * do_error - Something unexpected has happened during a lock request + * + */ + +static inline void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * @@ -375,36 +399,13 @@ restart: } if (gh->gh_list.prev == &gl->gl_holders) return 1; + do_error(gl, 0); break; } return 0; } /** - * do_error - Something unexpected has happened during a lock request - * - */ - -static inline void do_error(struct gfs2_glock *gl, const int ret) -{ - struct gfs2_holder *gh, *tmp; - - list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - continue; - if (ret & LM_OUT_ERROR) - gh->gh_error = -EIO; - else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) - gh->gh_error = GLR_TRYFAILED; - else - continue; - list_del_init(&gh->gh_list); - trace_gfs2_glock_queue(gh, 0); - gfs2_holder_wake(gh); - } -} - -/** * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock */ @@ -1062,6 +1063,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); add_to_queue(gh); + if ((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); run_queue(gl, 1); spin_unlock(&gl->gl_spin); @@ -1319,6 +1323,36 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) } /** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** * gfs2_glock_complete - Callback used by locking * @gl: Pointer to the glock * @ret: The return value from the dlm @@ -1328,18 +1362,17 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + gl->gl_reply = ret; + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { - struct gfs2_holder *gh; spin_lock(&gl->gl_spin); - gh = find_first_waiter(gl); - if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) && - (gl->gl_target != LM_ST_UNLOCKED)) || - ((ret & ~LM_OUT_ST_MASK) != 0)) + if (gfs2_should_freeze(gl)) { set_bit(GLF_FROZEN, &gl->gl_flags); - spin_unlock(&gl->gl_spin); - if (test_bit(GLF_FROZEN, &gl->gl_flags)) + spin_unlock(&gl->gl_spin); return; + } + spin_unlock(&gl->gl_spin); } set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gfs2_glock_hold(gl); @@ -1348,7 +1381,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b5d7363..fdbf4b3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/workqueue.h> -#include <linux/slow-work.h> #include <linux/dlm.h> #include <linux/buffer_head.h> @@ -383,7 +382,7 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - struct slow_work jd_work; + struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; #define JDF_RECOVERY 1 @@ -460,6 +459,7 @@ enum { SDF_NOBARRIERS = 3, SDF_NORECOVERY = 4, SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, }; #define GFS2_FSNAME_LEN 256 diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index b5612cb..f03afd9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, { struct inode *inode; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; inode = gfs2_iget(sb, no_addr); @@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; if ((type == DT_UNKNOWN) && (no_formal_ino == 0)) goto gfs2_nfsbypass; @@ -228,7 +229,8 @@ gfs2_nfsbypass: fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: if (inode->i_state & I_NEW) ip->i_gl->gl_object = NULL; @@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) { struct gfs2_sbd *sdp; struct gfs2_inode *ip; - struct gfs2_glock *io_gl; + struct gfs2_glock *io_gl = NULL; int error; struct gfs2_holder gh; struct inode *inode; @@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); + io_gl = NULL; inode->i_mode = DT2IF(DT_UNKNOWN); @@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) fail_glock: gfs2_glock_dq(&ip->i_iopen_gh); fail_iopen: - gfs2_glock_put(io_gl); + if (io_gl) + gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; gfs2_glock_put(ip->i_gl); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index fb2a5f9..b1e9630 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -15,7 +15,6 @@ #include <linux/init.h> #include <linux/gfs2_ondisk.h> #include <asm/atomic.h> -#include <linux/slow-work.h> #include "gfs2.h" #include "incore.h" @@ -24,6 +23,7 @@ #include "util.h" #include "glock.h" #include "quota.h" +#include "recovery.h" static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, @@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; - error = slow_work_register_user(THIS_MODULE); - if (error) - goto fail_slow; + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_NON_REENTRANT | WQ_RESCUER, 0); + if (!gfs_recovery_wq) + goto fail_wq; gfs2_register_debugfs(); @@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void) return 0; -fail_slow: +fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); @@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - slow_work_unregister_user(THIS_MODULE); + destroy_workqueue(gfs_recovery_wq); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3593b3a..4f44bde 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,7 +17,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> -#include <linux/slow-work.h> #include <linux/quotaops.h> #include "gfs2.h" @@ -76,7 +75,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) sb->s_fs_info = sdp; sdp->sd_vfs = sb; - + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); init_waitqueue_head(&sdp->sd_glock_wait); @@ -673,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); - slow_work_init(&jd->jd_work, &gfs2_recover_ops); + INIT_WORK(&jd->jd_work, gfs2_recover_func); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -782,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { - error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x), + true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); @@ -792,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) gfs2_others_may_mount(sdp); } else if (!sdp->sd_args.ar_spectator) { - error = gfs2_recover_journal(sdp->sd_jdesc); + error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); goto fail_jinode_gh; @@ -1050,7 +1050,8 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ret = match_int(&tmp[0], &option); if (ret || option < 0) goto hostdata_error; - ls->ls_jid = option; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; break; case Opt_id: /* Obsolete, but left for backward compat purposes */ @@ -1102,6 +1103,24 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } +static int gfs2_journalid_wait(void *word) +{ + if (signal_pending(current)) + return -EINTR; + schedule(); + return 0; +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_args.ar_spectator) + return 0; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); +} + void gfs2_online_uevent(struct gfs2_sbd *sdp) { struct super_block *sb = sdp->sd_vfs; @@ -1194,6 +1213,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (error) goto fail_locking; + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + error = init_inodes(sdp, DO); if (error) goto fail_sb; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 49667d6..1bc6b56 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; @@ -694,10 +694,8 @@ get_a_page: if (!buffer_mapped(bh)) goto unlock_out; /* If it's a newly allocated disk block for quota, zero it */ - if (buffer_new(bh)) { - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - } + if (buffer_new(bh)) + zero_user(page, pos - blocksize, bh->b_size); } if (PageUptodate(page)) @@ -723,7 +721,7 @@ get_a_page: /* If quota straddles page boundary, we need to update the rest of the * quota at the beginning of the next page */ - if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */ + if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { ptr = ptr + nbytes; nbytes = sizeof(struct gfs2_quota) - nbytes; offset = 0; @@ -789,15 +787,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) goto out; for (x = 0; x < num_qd; x++) { - int alloc_required; - offset = qd2offset(qda[x]); - error = gfs2_write_alloc_required(ip, offset, - sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_gunlock; - if (alloc_required) + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) nalloc++; } @@ -1457,10 +1449,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb, switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_ON: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD); /*FALLTHRU*/ case GFS2_QUOTA_ACCOUNT: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT); break; case GFS2_QUOTA_OFF: break; @@ -1506,7 +1498,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; + fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); @@ -1541,12 +1533,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, switch(type) { case USRQUOTA: type = QUOTA_USER; - if (fdq->d_flags != XFS_USER_QUOTA) + if (fdq->d_flags != FS_USER_QUOTA) return -EINVAL; break; case GRPQUOTA: type = QUOTA_GROUP; - if (fdq->d_flags != XFS_GROUP_QUOTA) + if (fdq->d_flags != FS_GROUP_QUOTA) return -EINVAL; break; default: @@ -1586,10 +1578,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, goto out_i; offset = qd2offset(qd); - error = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota), - &alloc_required); - if (error) - goto out_i; + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); if (alloc_required) { al = gfs2_alloc_get(ip); if (al == NULL) diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c..e7d236c 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 4b9bece..f7f89a9 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,7 +14,6 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/slow-work.h> #include "gfs2.h" #include "incore.h" @@ -28,6 +27,8 @@ #include "util.h" #include "dir.h" +struct workqueue_struct *gfs_recovery_wq; + int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) { @@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } -static int gfs2_recover_get_ref(struct slow_work *work) -{ - struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); - if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) - return -EBUSY; - return 0; -} - -static void gfs2_recover_put_ref(struct slow_work *work) -{ - struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); - clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); - wake_up_bit(&jd->jd_flags, JDF_RECOVERY); -} - -static void gfs2_recover_work(struct slow_work *work) +void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work) gfs2_glock_dq_uninit(&j_gh); fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return; + goto done; fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); @@ -590,32 +575,35 @@ fail_gunlock_j: } fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); - fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -struct slow_work_ops gfs2_recover_ops = { - .owner = THIS_MODULE, - .get_ref = gfs2_recover_get_ref, - .put_ref = gfs2_recover_put_ref, - .execute = gfs2_recover_work, -}; - - static int gfs2_recovery_wait(void *word) { schedule(); return 0; } -int gfs2_recover_journal(struct gfs2_jdesc *jd) +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; - rv = slow_work_enqueue(&jd->jd_work); - if (rv) - return rv; - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + TASK_UNINTERRUPTIBLE); + return 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 1616ac2..2226136 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -12,6 +12,8 @@ #include "incore.h" +extern struct workqueue_struct *gfs_recovery_wq; + static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) { if (++*blk == sdp->sd_jdesc->jd_blocks) @@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); -extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -extern struct slow_work_ops gfs2_recover_ops; +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4d1aad3..4140811 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -342,8 +342,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - int ar; - int error; if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { @@ -352,13 +350,12 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) } jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); - if (!error && ar) { + if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { gfs2_consist_inode(ip); - error = -EIO; + return -EIO; } - return error; + return 0; } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 37f5393..ccacffd 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -25,6 +25,7 @@ #include "quota.h" #include "util.h" #include "glops.h" +#include "recovery.h" struct gfs2_attr { struct attribute attr; @@ -325,6 +326,30 @@ static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_first); } +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -352,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { if (jd->jd_jid != jid) continue; - rv = slow_work_enqueue(&jd->jd_work); + rv = gfs2_recover_journal(jd, false); break; } out: @@ -377,14 +402,41 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); } +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + sdp->sd_lockstruct.ls_jid = jid; + smp_mb__after_clear_bit(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(jid, 0644, jid_show, jid_store); +GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0600, NULL, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); @@ -564,7 +616,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); - if (!sdp->sd_args.ar_spectator) + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) add_uevent_var(env, "UUID=%pUB", uuid); @@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { /* diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 93d1e47..f19ce94 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1281,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, int journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. */ @@ -1481,7 +1477,6 @@ int journal_flush(journal_t *journal) int journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JFS_LOADED)); @@ -1490,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 54c9bc9..81051da 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -283,12 +283,9 @@ int journal_recover(journal_t *journal) int journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; - struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); #endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", @@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / sizeof(journal_block_tag_t)); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 076d1cc..1c23a0f 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh) void __jbd2_log_wait_for_space(journal_t *journal) { int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ nblocks = jbd_space_needed(journal); while (__jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); /* @@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) * filesystem, so abort the journal and leave a stack * trace for forensic evidence. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); space_left = __jbd2_log_space_left(journal); @@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) if (journal->j_committing_transaction) tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); } else if (jbd2_cleanup_journal_tail(journal) == 0) { @@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); } @@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * next transaction ID we will write, and where it will * start. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; if (transaction) { @@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* If the oldest pinned transaction is at the tail of the log already then there's not much we can do right now. */ if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 1; } @@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * If there is an external journal, we need to make sure that @@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 75716d3..f52e5e8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -150,11 +150,11 @@ static int journal_submit_commit_record(journal_t *journal, */ if (ret == -EOPNOTSUPP && barrier_done) { printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + "JBD2: Disabling barriers on %s, " + "not supported by device\n", journal->j_devname); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* And try again, without the barrier */ lock_buffer(bh); @@ -180,11 +180,11 @@ retry: wait_on_buffer(bh); if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { printk(KERN_WARNING - "JBD2: wait_on_commit_record: sync failed on %s - " - "disabling barriers\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + "JBD2: %s: disabling barries on %s - not supported " + "by device\n", __func__, journal->j_devname); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); lock_buffer(bh); clear_buffer_dirty(bh); @@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; /* @@ -417,23 +417,23 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { + while (atomic_read(&commit_transaction->t_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { + if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); - J_ASSERT (commit_transaction->t_outstanding_credits <= + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); /* @@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); @@ -519,19 +519,20 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); + atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; descriptor = NULL; @@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * the free space in the log, but this counter is changed * by jbd2_journal_next_log_block() also. */ - commit_transaction->t_outstanding_credits--; + atomic_dec(&commit_transaction->t_outstanding_credits); /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get @@ -977,7 +978,7 @@ restart_loop: * __jbd2_journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction @@ -985,7 +986,7 @@ restart_loop: */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); goto restart_loop; } @@ -1003,7 +1004,8 @@ restart_loop: * File the transaction statistics */ stats.ts_tid = commit_transaction->t_tid; - stats.run.rs_handle_count = commit_transaction->t_handle_count; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); @@ -1037,7 +1039,7 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bc2ff59..ad5866a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -41,6 +41,7 @@ #include <linux/hash.h> #include <linux/log2.h> #include <linux/vmalloc.h> +#include <linux/backing-dev.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -48,8 +49,6 @@ #include <asm/uaccess.h> #include <asm/page.h> -EXPORT_SYMBOL(jbd2_journal_start); -EXPORT_SYMBOL(jbd2_journal_restart); EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -143,7 +142,7 @@ static int kjournald2(void *arg) /* * And now, wait forever for commit wakeup events. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); loop: if (journal->j_flags & JBD2_UNMOUNT) @@ -154,10 +153,10 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); goto loop; } @@ -169,9 +168,9 @@ loop: * be already stopped. */ jbd_debug(1, "Now suspending kjournald2\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); refrigerator(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { /* * We assume on resume that commits are already there, @@ -191,9 +190,9 @@ loop: if (journal->j_flags & JBD2_UNMOUNT) should_sleep = 0; if (should_sleep) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); } @@ -211,7 +210,7 @@ loop: goto loop; end_loop: - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); @@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal) static void journal_kill_thread(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -297,7 +296,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); - struct jbd2_buffer_trigger_type *triggers; journal_t *journal = transaction->t_journal; /* @@ -311,7 +309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + /* keep subsequent assertions sane */ new_bh->b_state = 0; init_buffer(new_bh, NULL, NULL); @@ -328,21 +336,21 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); - triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); - triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* - * Fire any commit trigger. Do this before checking for escaping, - * as the trigger may modify the magic offset. If a copy-out - * happens afterwards, it will have the correct data in the buffer. + * Fire data frozen trigger if data already wasn't frozen. Do this + * before checking for escaping, as the trigger may modify the magic + * offset. If a copy-out happens afterwards, it will have the correct + * data in the buffer. */ - jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, - triggers); + if (!done_copy_out) + jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jh_in->b_triggers); /* * Check for escaping @@ -443,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal) { int left = journal->j_free; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ /* * Be pessimistic here about the number of those free blocks which @@ -488,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) { int ret; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ret = __jbd2_log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -509,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction_t *transaction = NULL; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; __jbd2_log_start_commit(journal, transaction->t_tid); @@ -517,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return 0; /* Nothing to retry */ } tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); return 1; } @@ -536,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { int ret = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; @@ -555,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) *ptid = journal->j_committing_transaction->t_tid; ret = 1; } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -567,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG - spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } - spin_unlock(&journal->j_state_lock); #endif - spin_lock(&journal->j_state_lock); while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { printk(KERN_EMERG "journal commit I/O error\n"); @@ -603,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) { unsigned long blocknr; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); blocknr = journal->j_head; @@ -611,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) journal->j_free--; if (journal->j_head == journal->j_last) journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return jbd2_journal_bmap(journal, blocknr, retp); } @@ -831,7 +837,7 @@ static journal_t * journal_init_common (void) mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); + rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; @@ -1097,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); @@ -1125,12 +1131,12 @@ out: * any future commit will have to be careful to update the * superblock again to re-record the true start of the log. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (sb->s_start) journal->j_flags &= ~JBD2_FLUSHED; else journal->j_flags |= JBD2_FLUSHED; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -1392,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. */ @@ -1546,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal) transaction_t *transaction = NULL; unsigned long old_tail; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); /* Force everything buffered to the log... */ if (journal->j_running_transaction) { @@ -1559,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal) if (transaction) { tid_t tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); } else { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* ...and flush everything in the log out to disk. */ @@ -1586,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); old_tail = journal->j_tail; journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_tail = old_tail; J_ASSERT(!journal->j_running_transaction); @@ -1599,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } @@ -1618,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal) int jbd2_journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); @@ -1627,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; @@ -1666,12 +1665,12 @@ void __jbd2_journal_abort_hard(journal_t *journal) printk(KERN_ERR "Aborting journal on device %s.\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; transaction = journal->j_running_transaction; if (transaction) __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* Soft abort: record the abort error status in the journal superblock, @@ -1756,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal) { int err; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else err = journal->j_errno; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return err; } @@ -1776,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal) { int err = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return err; } @@ -1794,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal) */ void jbd2_journal_ack_err(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_errno) journal->j_flags |= JBD2_ACK_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int jbd2_journal_blocks_per_page(struct inode *inode) @@ -2202,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode) { - int writeout = 0; - if (!journal) return; restart: @@ -2220,9 +2217,6 @@ restart: goto restart; } - /* Do we need to wait for data writeback? */ - if (journal->j_committing_transaction == jinode->i_transaction) - writeout = 1; if (jinode->i_transaction) { list_del(&jinode->i_list); jinode->i_transaction = NULL; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 049281b..2bc4d5f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal) int jbd2_journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); #endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", @@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal, int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / tag_bytes); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e214d68..d95cc9d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> +#include <linux/backing-dev.h> +#include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -83,65 +88,75 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * transaction's buffer credits. */ -static int start_this_handle(journal_t *journal, handle_t *handle) +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) { transaction_t *transaction; int needed; int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; - int ret = 0; unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; + return -ENOSPC; } alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); if (!new_transaction) { - ret = -ENOMEM; - goto out; + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. + */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; } } jbd_debug(3, "New handle %p going live.\n", handle); -repeat: - /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ - spin_lock(&journal->j_state_lock); -repeat_locked: +repeat: + read_lock(&journal->j_state_lock); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; + read_unlock(&journal->j_state_lock); + kfree(new_transaction); + return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); + if (!new_transaction) goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; } - jbd2_get_transaction(journal, new_transaction); - new_transaction = NULL; + write_unlock(&journal->j_state_lock); + goto repeat; } transaction = journal->j_running_transaction; @@ -155,7 +170,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -166,8 +181,8 @@ repeat_locked: * buffers requested by this operation, we need to stall pending a log * checkpoint to free some more log space. */ - spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* @@ -178,11 +193,11 @@ repeat_locked: DEFINE_WAIT(wait); jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -215,35 +230,48 @@ repeat_locked: */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); - __jbd2_log_wait_for_space(journal); - goto repeat_locked; + atomic_sub(nblocks, &transaction->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - if (time_after(transaction->t_start, ts)) { + * use and add the handle to the running transaction. + * + * In order for t_max_wait to be reliable, it must be + * protected by a lock. But doing so will mean that + * start_this_handle() can not be run in parallel on SMP + * systems, which limits our scalability. So we only enable + * it when debugging is enabled. We may want to use a + * separate flag, eventually, so we can enable this + * independently of debugging. + */ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); if (ts > transaction->t_max_wait) transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); } - +#endif handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; - transaction->t_handle_count++; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; + kfree(new_transaction); + return 0; } static struct lock_class_key jbd2_handle_key; @@ -278,7 +306,7 @@ static handle_t *new_handle(int nblocks) * * Return a pointer to a newly allocated handle, or NULL on failure */ -handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -298,7 +326,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) current->journal_info = handle; - err = start_this_handle(journal, handle); + err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; @@ -308,6 +336,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) out: return handle; } +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + /** * int jbd2_journal_extend() - extend buffer credits. @@ -342,7 +379,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) result = 1; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ if (handle->h_transaction->t_state != T_RUNNING) { @@ -352,7 +389,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " @@ -367,14 +404,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); unlock: spin_unlock(&transaction->t_handle_lock); error_out: - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); out: return result; } @@ -394,8 +431,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ - -int jbd2_journal_restart(handle_t *handle, int nblocks) +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -410,29 +446,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) * First unlink the handle from its current transaction, and start the * commit on that. */ - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "restarting handle %p\n", handle); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); + ret = start_this_handle(journal, handle, gfp_mask); return ret; } +EXPORT_SYMBOL(jbd2__journal_restart); +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); + /** * void jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. @@ -447,7 +489,7 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -458,19 +500,19 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { + if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); break; } prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but @@ -494,9 +536,9 @@ void jbd2_journal_unlock_updates (journal_t *journal) J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } @@ -725,6 +767,9 @@ done: page = jh2bh(jh)->b_page; offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; source = kmap_atomic(page, KM_USER0); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, + jh->b_triggers); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); @@ -963,15 +1008,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, jh->b_triggers = type; } -void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, +void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers) { struct buffer_head *bh = jh2bh(jh); - if (!triggers || !triggers->t_commit) + if (!triggers || !triggers->t_frozen) return; - triggers->t_commit(triggers, bh, mapped_data, bh->b_size); + triggers->t_frozen(triggers, bh, mapped_data, bh->b_size); } void jbd2_buffer_abort_trigger(struct journal_head *jh, @@ -1235,7 +1280,8 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int err; + int err, wait_for_commit = 0; + tid_t tid; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1243,7 +1289,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; else { - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); err = 0; } @@ -1288,9 +1334,9 @@ int jbd2_journal_stop(handle_t *handle) journal->j_last_sync_writer = pid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); @@ -1311,14 +1357,8 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit @@ -1327,15 +1367,13 @@ int jbd2_journal_stop(handle_t *handle) * transaction is too old now. */ if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ - tid_t tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ @@ -1346,11 +1384,25 @@ int jbd2_journal_stop(handle_t *handle) * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = jbd2_log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); + wait_for_commit = 1; } + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start commiting on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + lock_map_release(&handle->h_lockdep_map); jbd2_free_handle(handle); @@ -1716,7 +1768,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1769,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* There is no currently-running transaction. So the @@ -1783,7 +1835,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* The orphan record's transaction has @@ -1807,7 +1859,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -1826,7 +1878,7 @@ zap_buffer: zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2133,9 +2185,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index a2d58c9..d258e26 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) { - /* success of check_xattr_ref_inode() means taht inode (ic) dose not have + /* success of check_xattr_ref_inode() means that inode (ic) dose not have * duplicate name/value pairs. If duplicate name/value pair would be found, * one will be removed. */ diff --git a/fs/mbcache.c b/fs/mbcache.c index ec88ff3..e28f21b 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache) * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -191,13 +191,14 @@ forget: * This function is called by the kernel memory management when memory * gets low. * + * @shrink: (ignored) * @nr_to_scan: Number of objects to scan * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; @@ -282,8 +282,7 @@ int inode_permission(struct inode *inode, int mask) if (retval) return retval; - return security_inode_permission(inode, - mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); + return security_inode_permission(inode, mask); } /** @@ -1484,8 +1483,7 @@ static int handle_truncate(struct path *path) */ error = locks_verify_locked(inode); if (!error) - error = security_path_truncate(path, 0, - ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + error = security_path_truncate(path); if (!error) { error = do_truncate(path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index fa33851..1e634de 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -728,8 +728,8 @@ out_fput: out_bdi: /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: * - * The previously used put_filp(ncp_filp); was bogous, since - * it doesn't proper unlocking. + * The previously used put_filp(ncp_filp); was bogus, since + * it doesn't perform proper unlocking. */ fput(ncp_filp); out: diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index a43d07e..cc1bb33 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -61,8 +61,8 @@ config NFS_V3_ACL If unsure, say N. config NFS_V4 - bool "NFS client support for NFS version 4 (EXPERIMENTAL)" - depends on NFS_FS && EXPERIMENTAL + bool "NFS client support for NFS version 4" + depends on NFS_FS select RPCSEC_GSS_KRB5 help This option enables support for version 4 of the NFS protocol @@ -72,16 +72,16 @@ config NFS_V4 space programs which can be found in the Linux nfs-utils package, available from http://linux-nfs.org/. - If unsure, say N. + If unsure, say Y. config NFS_V4_1 - bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)" + bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_V4 && EXPERIMENTAL help This option enables support for minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. - Unless you're an NFS developer, say N. + If unsure, say N. config ROOT_NFS bool "Root file system on NFS" diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a08770a..930d10f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * if (inode == NULL) goto out_putclient; nfsi = NFS_I(inode); - down_read(&nfsi->rwsem); - delegation = nfsi->delegation; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); @@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * args->bitmap[1]; res->status = 0; out_iput: - up_read(&nfsi->rwsem); + rcu_read_unlock(); iput(inode); out_putclient: nfs_put_client(clp); @@ -62,16 +62,6 @@ out: return res->status; } -static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) -{ -#if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion > 0) - return nfs41_validate_delegation_stateid; -#endif - return nfs4_validate_delegation_stateid; -} - - __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { struct nfs_client *clp; @@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) inode = nfs_delegation_find_inode(clp, &args->fh); if (inode != NULL) { /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid, - nfs_validate_delegation_stateid(clp))) { + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: res = 0; break; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d25b525..4e7df2a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) @@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) clp->cl_session = NULL; } - clp->cl_call_sync = _nfs4_call_sync; + clp->cl_mvops = nfs_v4_minor_ops[0]; #endif /* CONFIG_NFS_V4_1 */ } @@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); + nfs_callback_down(clp->cl_mvops->minor_version); } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp) return error; } - error = nfs_callback_up(clp->cl_minorversion, + error = nfs_callback_up(clp->cl_mvops->minor_version, clp->cl_rpcclient->cl_xprt); if (error < 0) { dprintk("%s: failed to start callback. Error = %d\n", @@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp) */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { - clp->cl_call_sync = _nfs4_call_sync; - #if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion) { + if (clp->cl_mvops->minor_version) { struct nfs4_session *session = NULL; /* * Create the session and mark it expired. @@ -1158,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return -ENOMEM; clp->cl_session = session; - clp->cl_call_sync = _nfs4_call_sync_session; + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; } #endif /* CONFIG_NFS_V4_1 */ @@ -1454,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->authflavor, parent_server->client->cl_xprt->prot, parent_server->client->cl_timeout, - parent_client->cl_minorversion); + parent_client->cl_mvops->minor_version); if (error < 0) goto error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 3016345..b9c3c43 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -268,14 +268,6 @@ out: return status; } -/* Sync all data to disk upon delegation return */ -static void nfs_msync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - nfs_wb_all(inode); - filemap_fdatawait(inode->i_mapping); -} - /* * Basic procedure for returning a delegation to the server */ @@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode) delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); spin_unlock(&clp->cl_lock); if (delegation != NULL) { - nfs_msync_inode(inode); + nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); } } @@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) /* * Asynchronous delegation recall! */ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)) +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (!validate_stateid(delegation, stateid)) { + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { rcu_read_unlock(); return -ENOENT; } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 69e7b814..2026304 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 782b431..29539ce 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1652,16 +1652,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - /* - * ... prune child dentries and writebacks if needed. - */ - if (atomic_read(&old_dentry->d_count) > 1) { - if (S_ISREG(old_inode->i_mode)) - nfs_wb_all(old_inode); - shrink_dcache_parent(old_dentry); - } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) nfs_inode_return_delegation(new_inode); @@ -1710,7 +1701,7 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; @@ -1953,7 +1944,7 @@ int nfs_permission(struct inode *inode, int mask) if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) goto out; /* Is this sys_access() ? */ - if (mask & MAY_ACCESS) + if (mask & (MAY_ACCESS | MAY_CHDIR)) goto force_lookup; switch (inode->i_mode & S_IFMT) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad4cd31..064a809 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -69,6 +69,7 @@ struct nfs_direct_req { /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ @@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->rewrite_list); dreq->iocb = NULL; dreq->ctx = NULL; + dreq->l_ctx = NULL; spin_lock_init(&dreq->lock); atomic_set(&dreq->io_count, 0); dreq->count = 0; @@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) put_nfs_open_context(dreq->ctx); kmem_cache_free(nfs_direct_cachep, dreq); @@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; dreq = nfs_direct_req_alloc(); - if (!dreq) - return -ENOMEM; + if (dreq == NULL) + goto out; dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } @@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.offset = 0; data->args.count = 0; data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, size_t count) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; size_t wsize = NFS_SERVER(inode)->wsize; @@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq = nfs_direct_req_alloc(); if (!dreq) - return -ENOMEM; + goto out; nfs_alloc_commit_data(dreq); if (dreq->commit_data == NULL || count < wsize) @@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx != NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 36a5e74..2d141a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -27,6 +27,7 @@ #include <linux/pagemap.h> #include <linux/aio.h> #include <linux/gfp.h> +#include <linux/swap.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -202,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) } /* - * Helper for nfs_file_flush() and nfs_file_fsync() - * - * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to - * disk, but it retrieves and clears ctx->error after synching, despite - * the two being set at the same time in nfs_context_set_write_error(). - * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occured, and hence cause it to - * fall back to doing a synchronous write. - */ -static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) -{ - int have_error, status; - int ret = 0; - - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - status = nfs_wb_all(inode); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) - ret = xchg(&ctx->error, 0); - if (!ret) - ret = status; - return ret; -} - -/* * Flush all dirty pages, and check for write errors. */ static int nfs_file_flush(struct file *file, fl_owner_t id) { - struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -245,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return nfs_do_fsync(ctx, inode); + return vfs_fsync(file, 0); } static ssize_t @@ -320,6 +295,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * Flush any dirty pages for this process, and check for write errors. * The return status from this call provides a reliable indication of * whether any write errors occurred for this process. + * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occured, and hence cause it to + * fall back to doing a synchronous write. */ static int nfs_file_fsync(struct file *file, int datasync) @@ -327,13 +309,23 @@ nfs_file_fsync(struct file *file, int datasync) struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; + int have_error, status; + int ret = 0; + dprintk("NFS: fsync file(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - return nfs_do_fsync(ctx, inode); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret) + ret = status; + return ret; } /* @@ -493,11 +485,19 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) */ static int nfs_release_page(struct page *page, gfp_t gfp) { + struct address_space *mapping = page->mapping; + dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if ((gfp & GFP_KERNEL) == GFP_KERNEL) - nfs_wb_page(page->mapping->host, page); + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + int how = FLUSH_SYNC; + + /* Don't let kswapd deadlock waiting for OOM RPC calls */ + if (current_is_kswapd()) + how = 0; + nfs_commit_inode(mapping->host, how); + } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; @@ -639,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); + int err = vfs_fsync(iocb->ki_filp, 0); if (err < 0) result = err; } @@ -675,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, written = ret; if (ret >= 0 && nfs_need_sync_write(filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(filp), inode); + int err = vfs_fsync(filp, 0); if (err < 0) ret = err; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 099b351..581d8f0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -413,10 +413,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) return 0; /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - filemap_write_and_wait(inode->i_mapping); + if (S_ISREG(inode->i_mode)) nfs_wb_all(inode); - } fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -530,6 +528,68 @@ out: return err; } +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->path.dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context @@ -566,11 +626,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; - ctx->lockowner = current->files; ctx->flags = 0; ctx->error = 0; ctx->dir_cookie = 0; - atomic_set(&ctx->count, 1); + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; } return ctx; } @@ -578,7 +638,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->count); + atomic_inc(&ctx->lock_context.count); return ctx; } @@ -586,7 +646,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d8bd619..4c2150d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + int nr_to_scan, gfp_t gfp_mask); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; @@ -369,10 +370,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) * Helper for restarting RPC calls in the possible presence of NFSv4.1 * sessions. */ -static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) { if (nfs4_has_session(clp)) - rpc_restart_call_prepare(task); - else - rpc_restart_call(task); + return rpc_restart_call_prepare(task); + return rpc_restart_call(task); } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 81cf142..db8846a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs static int nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 offset = (u32)args->offset; u32 count = args->count; @@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg static int nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) { - struct rpc_task *task = req->rq_task; - struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) static int nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 75dcfc7..9769704 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg static int nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) static int nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -675,7 +675,7 @@ static int nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_getaclargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); @@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) static int nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c538c61..311e15c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -45,10 +45,29 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, NFS4CLNT_SESSION_RESET, - NFS4CLNT_SESSION_DRAINING, NFS4CLNT_RECALL_SLOT, }; +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +struct nfs4_minor_version_ops { + u32 minor_version; + + int (*call_sync)(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + int (*validate_stateid)(struct nfs_delegation *, + const nfs4_stateid *); + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops *state_renewal_ops; +}; + /* * struct rpc_sequence ensures that RPC calls are sent in the exact * order that they appear on the list. @@ -89,7 +108,6 @@ struct nfs_unique_id { */ struct nfs4_state_owner { struct nfs_unique_id so_owner_id; - struct nfs_client *so_client; struct nfs_server *so_server; struct rb_node so_client_node; @@ -99,7 +117,6 @@ struct nfs4_state_owner { atomic_t so_count; unsigned long so_flags; struct list_head so_states; - struct list_head so_delegations; struct nfs_seqid_counter so_seqid; struct rpc_sequence so_sequence; }; @@ -125,10 +142,20 @@ enum { * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) */ +struct nfs4_lock_owner { + unsigned int lo_type; +#define NFS4_ANY_LOCK_TYPE (0U) +#define NFS4_FLOCK_LOCK_TYPE (1U << 0) +#define NFS4_POSIX_LOCK_TYPE (1U << 1) + union { + fl_owner_t posix_owner; + pid_t flock_owner; + } lo_u; +}; + struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ - fl_owner_t ls_owner; /* POSIX lock owner */ #define NFS_LOCK_INITIALIZED 1 int ls_flags; struct nfs_seqid_counter ls_seqid; @@ -136,6 +163,7 @@ struct nfs4_lock_state { struct nfs_unique_id ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; + struct nfs4_lock_owner ls_owner; }; /* bits for nfs4_state->flags */ @@ -219,11 +247,15 @@ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nam extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); +extern void nfs4_release_lockowner(const struct nfs4_lock_state *); -extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; -extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; #if defined(CONFIG_NFS_V4_1) -extern int nfs4_setup_sequence(struct nfs_client *clp, +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); @@ -234,7 +266,12 @@ extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); #else /* CONFIG_NFS_v4_1 */ -static inline int nfs4_setup_sequence(struct nfs_client *clp, +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task) { @@ -247,7 +284,7 @@ static inline int nfs4_init_session(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; @@ -284,7 +321,7 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); +extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 70015dd..7ffbb98 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -303,15 +303,19 @@ do_state_recovery: } -static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) { - struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); } +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +{ + do_renew_lease(server->nfs_client, timestamp); +} + #if defined(CONFIG_NFS_V4_1) /* @@ -356,7 +360,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) { struct rpc_task *task; - if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); if (task) rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); @@ -370,12 +374,11 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) complete(&ses->complete); } -static void nfs41_sequence_free_slot(const struct nfs_client *clp, - struct nfs4_sequence_res *res) +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot_table *tbl; - tbl = &clp->cl_session->fc_slot_table; + tbl = &res->sr_session->fc_slot_table; if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ @@ -385,18 +388,17 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp, spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slotid); - nfs41_check_drain_session_complete(clp->cl_session); + nfs41_check_drain_session_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slotid = NFS4_MAX_SLOT_TABLE; } -static void nfs41_sequence_done(struct nfs_client *clp, - struct nfs4_sequence_res *res, - int rpc_status) +static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { unsigned long timestamp; struct nfs4_slot_table *tbl; struct nfs4_slot *slot; + struct nfs_client *clp; /* * sr_status remains 1 if an RPC level error occurred. The server @@ -411,25 +413,51 @@ static void nfs41_sequence_done(struct nfs_client *clp, if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) goto out; + tbl = &res->sr_session->fc_slot_table; + slot = tbl->slots + res->sr_slotid; + /* Check the SEQUENCE operation status */ - if (res->sr_status == 0) { - tbl = &clp->cl_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; + switch (res->sr_status) { + case 0: /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; timestamp = res->sr_renewal_time; - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal, timestamp)) - clp->cl_last_renewal = timestamp; - spin_unlock(&clp->cl_lock); + clp = res->sr_session->clp; + do_renew_lease(clp, timestamp); /* Check sequence flags */ if (atomic_read(&clp->cl_count) > 1) nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + break; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%d seq=%d: Operation in progress\n", + __func__, res->sr_slotid, slot->seq_nr); + goto out_retry; + default: + /* Just update the slot sequence no. */ + ++slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(clp, res); + nfs41_sequence_free_slot(res); + return 1; +out_retry: + if (!rpc_restart_call(task)) + goto out; + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return 0; +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_session == NULL) + return 1; + return nfs41_sequence_done(task, res); } /* @@ -480,12 +508,11 @@ static int nfs41_setup_sequence(struct nfs4_session *session, if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) return 0; - memset(res, 0, sizeof(*res)); res->sr_slotid = NFS4_MAX_SLOT_TABLE; tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && + if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { /* * The state manager will wait until the slot table is empty. @@ -525,6 +552,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, res->sr_session = session; res->sr_slotid = slotid; res->sr_renewal_time = jiffies; + res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain * set to 1 if an rpc level failure occurs. @@ -533,33 +561,33 @@ static int nfs41_setup_sequence(struct nfs4_session *session, return 0; } -int nfs4_setup_sequence(struct nfs_client *clp, +int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task) { + struct nfs4_session *session = nfs4_get_session(server); int ret = 0; + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; + goto out; + } + dprintk("--> %s clp %p session %p sr_slotid %d\n", - __func__, clp, clp->cl_session, res->sr_slotid); + __func__, session->clp, session, res->sr_slotid); - if (!nfs4_has_session(clp)) - goto out; - ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, + ret = nfs41_setup_sequence(session, args, res, cache_reply, task); - if (ret && ret != -EAGAIN) { - /* terminate rpc task */ - task->tk_status = ret; - task->tk_action = NULL; - } out: dprintk("<-- %s status=%d\n", __func__, ret); return ret; } struct nfs41_call_sync_data { - struct nfs_client *clp; + const struct nfs_server *seq_server; struct nfs4_sequence_args *seq_args; struct nfs4_sequence_res *seq_res; int cache_reply; @@ -569,9 +597,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; - dprintk("--> %s data->clp->cl_session %p\n", __func__, - data->clp->cl_session); - if (nfs4_setup_sequence(data->clp, data->seq_args, + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + if (nfs4_setup_sequence(data->seq_server, data->seq_args, data->seq_res, data->cache_reply, task)) return; rpc_call_start(task); @@ -587,7 +615,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; - nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); + nfs41_sequence_done(task, data->seq_res); } struct rpc_call_ops nfs41_call_sync_ops = { @@ -600,8 +628,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static int nfs4_call_sync_sequence(struct nfs_client *clp, - struct rpc_clnt *clnt, +static int nfs4_call_sync_sequence(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -611,13 +638,13 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp, int ret; struct rpc_task *task; struct nfs41_call_sync_data data = { - .clp = clp, + .seq_server = server, .seq_args = args, .seq_res = res, .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { - .rpc_client = clnt, + .rpc_client = server->client, .rpc_message = msg, .callback_ops = &nfs41_call_sync_ops, .callback_data = &data @@ -642,10 +669,15 @@ int _nfs4_call_sync_session(struct nfs_server *server, struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(server->nfs_client, server->client, - msg, args, res, cache_reply, 0); + return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); } +#else +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return 1; +} #endif /* CONFIG_NFS_V4_1 */ int _nfs4_call_sync(struct nfs_server *server, @@ -659,18 +691,9 @@ int _nfs4_call_sync(struct nfs_server *server, } #define nfs4_call_sync(server, msg, args, res, cache_reply) \ - (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ + (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ &(res)->seq_res, (cache_reply)) -static void nfs4_sequence_done(const struct nfs_server *server, - struct nfs4_sequence_res *res, int rpc_status) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(server->nfs_client)) - nfs41_sequence_done(server->nfs_client, res, rpc_status); -#endif /* CONFIG_NFS_V4_1 */ -} - static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); @@ -745,19 +768,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - if (flags & O_EXCL) { - if (nfs4_has_persistent_session(server->nfs_client)) { - /* GUARDED */ - p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); - } else { /* EXCLUSIVE4_1 */ - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; - } - } else if (flags & O_CREAT) { + if (flags & O_CREAT) { + u32 *s; + p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); + s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; @@ -1255,8 +1273,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) struct nfs4_opendata *data = calldata; data->rpc_status = task->tk_status; - if (RPC_ASSASSINATED(task)) - return; if (data->rpc_status == 0) { memcpy(data->o_res.stateid.data, data->c_res.stateid.data, sizeof(data->o_res.stateid.data)); @@ -1356,13 +1372,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) } /* Update sequence id. */ data->o_arg.id = sp->so_owner_id.id; - data->o_arg.clientid = sp->so_client->cl_clientid; + data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->o_arg.server->nfs_client, + if (nfs4_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, &data->o_res.seq_res, 1, task)) return; @@ -1385,11 +1401,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, - task->tk_status); - - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) return; + if (task->tk_status == 0) { switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: @@ -1773,7 +1787,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { /* Use that stateid */ } else if (state != NULL) { - nfs4_copy_stateid(&arg.stateid, state, current->files); + nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); @@ -1838,8 +1852,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); - nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -1903,7 +1916,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; - if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, + if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), &calldata->arg.seq_args, &calldata->res.seq_res, 1, task)) return; @@ -2648,7 +2661,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) { struct nfs_removeres *res = task->tk_msg.rpc_resp; - nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); @@ -3093,7 +3107,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) dprintk("--> %s\n", __func__); - nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, server->nfs_client); @@ -3116,8 +3131,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); @@ -3145,8 +3160,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; @@ -3196,10 +3212,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) nfs4_schedule_state_recovery(clp); return; } - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal,timestamp)) - clp->cl_last_renewal = timestamp; - spin_unlock(&clp->cl_lock); + do_renew_lease(clp, timestamp); } static const struct rpc_call_ops nfs4_renew_ops = { @@ -3240,10 +3253,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); if (status < 0) return status; - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal,now)) - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); + do_renew_lease(clp, now); return 0; } @@ -3464,9 +3474,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen } static int -_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) { - if (!clp || task->tk_status >= 0) + struct nfs_client *clp = server->nfs_client; + + if (task->tk_status >= 0) return 0; switch(task->tk_status) { case -NFS4ERR_ADMIN_REVOKED: @@ -3498,8 +3510,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: - if (server) - nfs_inc_server_stats(server, NFSIOS_DELAY); + nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); @@ -3520,12 +3531,6 @@ do_state_recovery: return -EAGAIN; } -static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) -{ - return _nfs4_async_handle_error(task, server, server->nfs_client, state); -} - int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred, struct nfs4_setclientid_res *res) @@ -3641,8 +3646,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; - nfs4_sequence_done(data->res.server, &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; switch (task->tk_status) { case -NFS4ERR_STALE_STATEID: @@ -3672,7 +3677,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_setup_sequence(d_data->res.server->nfs_client, + if (nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, &d_data->res.seq_res, 1, task)) return; @@ -3892,9 +3897,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; - nfs4_sequence_done(calldata->server, &calldata->res.seq_res, - task->tk_status); - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; switch (task->tk_status) { case 0: @@ -3927,7 +3930,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) return; } calldata->timestamp = jiffies; - if (nfs4_setup_sequence(calldata->server->nfs_client, + if (nfs4_setup_sequence(calldata->server, &calldata->arg.seq_args, &calldata->res.seq_res, 1, task)) return; @@ -4082,7 +4085,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) } else data->arg.new_lock_owner = 0; data->timestamp = jiffies; - if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); @@ -4101,12 +4105,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); - nfs4_sequence_done(data->server, &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; data->rpc_status = task->tk_status; - if (RPC_ASSASSINATED(task)) - goto out; if (data->arg.new_lock_owner != 0) { if (data->rpc_status == 0) nfs_confirm_seqid(&data->lsp->ls_seqid, 0); @@ -4424,6 +4426,34 @@ out: return err; } +static void nfs4_release_lockowner_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_release = nfs4_release_lockowner_release, +}; + +void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_release_lockowner_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + args = kmalloc(sizeof(*args), GFP_NOFS); + if (!args) + return; + args->lock_owner.clientid = server->nfs_client->cl_clientid; + args->lock_owner.id = lsp->ls_id.id; + msg.rpc_argp = args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +} + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, @@ -4611,7 +4641,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: @@ -4805,13 +4836,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - /* - * The create session reply races with the server back - * channel probe. Mark the client NFS_CS_SESSION_INITING - * so that the client back channel can find the - * nfs_client struct - */ - clp->cl_cons_state = NFS_CS_SESSION_INITING; init_completion(&session->complete); tbl = &session->fc_slot_table; @@ -4824,6 +4848,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + session->session_state = 1<<NFS4_SESSION_INITING; + session->clp = clp; return session; } @@ -5040,6 +5066,10 @@ int nfs4_init_session(struct nfs_server *server) if (!nfs4_has_session(clp)) return 0; + session = clp->cl_session; + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + rsize = server->rsize; if (rsize == 0) rsize = NFS_MAX_FILE_IO_SIZE; @@ -5047,7 +5077,6 @@ int nfs4_init_session(struct nfs_server *server) if (wsize == 0) wsize = NFS_MAX_FILE_IO_SIZE; - session = clp->cl_session; session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; @@ -5060,69 +5089,70 @@ int nfs4_init_session(struct nfs_server *server) /* * Renew the cl_session lease. */ -static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) -{ +struct nfs4_sequence_data { + struct nfs_client *clp; struct nfs4_sequence_args args; struct nfs4_sequence_res res; - - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], - .rpc_argp = &args, - .rpc_resp = &res, - .rpc_cred = cred, - }; - - args.sa_cache_this = 0; - - return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, - &res, args.sa_cache_this, 1); -} +}; static void nfs41_sequence_release(void *data) { - struct nfs_client *clp = (struct nfs_client *)data; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; if (atomic_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_state_recovery(clp); + } + return 0; } static void nfs41_sequence_call_done(struct rpc_task *task, void *data) { - struct nfs_client *clp = (struct nfs_client *)data; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; - nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (atomic_read(&clp->cl_count) == 1) goto out; - if (_nfs4_async_handle_error(task, NULL, clp, NULL) - == -EAGAIN) { - nfs_restart_rpc(task, clp); + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); return; } } dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); out: - kfree(task->tk_msg.rpc_argp); - kfree(task->tk_msg.rpc_resp); - dprintk("<-- %s\n", __func__); } static void nfs41_sequence_prepare(struct rpc_task *task, void *data) { - struct nfs_client *clp; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; struct nfs4_sequence_args *args; struct nfs4_sequence_res *res; - clp = (struct nfs_client *)data; args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs4_setup_sequence(clp, args, res, 0, task)) + if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) return; rpc_call_start(task); } @@ -5133,32 +5163,67 @@ static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_release = nfs41_sequence_release, }; -static int nfs41_proc_async_sequence(struct nfs_client *clp, - struct rpc_cred *cred) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) { - struct nfs4_sequence_args *args; - struct nfs4_sequence_res *res; + struct nfs4_sequence_data *calldata; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + }; if (!atomic_inc_not_zero(&clp->cl_count)) - return -EIO; - args = kzalloc(sizeof(*args), GFP_NOFS); - res = kzalloc(sizeof(*res), GFP_NOFS); - if (!args || !res) { - kfree(args); - kfree(res); + return ERR_PTR(-EIO); + calldata = kmalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { nfs_put_client(clp); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - res->sr_slotid = NFS4_MAX_SLOT_TABLE; - msg.rpc_argp = args; - msg.rpc_resp = res; + calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs41_sequence_ops, (void *)clp); + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret = 0; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; } struct nfs4_reclaim_complete_data { @@ -5172,13 +5237,31 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) struct nfs4_reclaim_complete_data *calldata = data; rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, + if (nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, &calldata->res.seq_res, 0, task)) return; rpc_call_start(task); } +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? */ + break; + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_state_recovery(clp); + } + return 0; +} + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; @@ -5186,32 +5269,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) struct nfs4_sequence_res *res = &calldata->res.seq_res; dprintk("--> %s\n", __func__); - nfs41_sequence_done(clp, res, task->tk_status); - switch (task->tk_status) { - case 0: - case -NFS4ERR_COMPLETE_ALREADY: - break; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_DEADSESSION: - /* - * Handle the session error, but do not retry the operation, as - * we have no way of telling whether the clientid had to be - * reset before we got our reply. If reset, a new wave of - * reclaim operations will follow, containing their own reclaim - * complete. We don't want our retry to get on the way of - * recovery by incorrectly indicating to the server that we're - * done reclaiming state since the process had to be restarted. - */ - _nfs4_async_handle_error(task, NULL, clp, NULL); - break; - default: - if (_nfs4_async_handle_error( - task, NULL, clp, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; - } - } + if (!nfs41_sequence_done(task, res)) + return; + if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } dprintk("<-- %s\n", __func__); } @@ -5325,28 +5389,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { }; #endif -/* - * Per minor version reboot and network partition recovery ops - */ - -struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { - &nfs40_reboot_recovery_ops, -#if defined(CONFIG_NFS_V4_1) - &nfs41_reboot_recovery_ops, -#endif +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .call_sync = _nfs4_call_sync, + .validate_stateid = nfs4_validate_delegation_stateid, + .reboot_recovery_ops = &nfs40_reboot_recovery_ops, + .nograce_recovery_ops = &nfs40_nograce_recovery_ops, + .state_renewal_ops = &nfs40_state_renewal_ops, }; -struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { - &nfs40_nograce_recovery_ops, #if defined(CONFIG_NFS_V4_1) - &nfs41_nograce_recovery_ops, -#endif +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .call_sync = _nfs4_call_sync_session, + .validate_stateid = nfs41_validate_delegation_stateid, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, }; +#endif -struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { - &nfs40_state_renewal_ops, +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, #if defined(CONFIG_NFS_V4_1) - &nfs41_state_renewal_ops, + [1] = &nfs_v4_1_minor_ops, #endif }; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index d87f103..72b6c58 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -54,14 +54,14 @@ void nfs4_renew_state(struct work_struct *work) { - struct nfs4_state_maintenance_ops *ops; + const struct nfs4_state_maintenance_ops *ops; struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); struct rpc_cred *cred; long lease; unsigned long last, now; - ops = nfs4_state_renewal_ops[clp->cl_minorversion]; + ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); /* Are there any active superblocks? */ if (list_empty(&clp->cl_superblocks)) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 34acf59..3e2f19b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -145,7 +145,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int max_slots; - if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { + if (ses == NULL) + return; + if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&ses->fc_slot_table.slot_tbl_lock); max_slots = ses->fc_slot_table.max_slots; while (max_slots--) { @@ -167,7 +169,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_slot_table *tbl = &ses->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); if (tbl->highest_used_slotid != -1) { INIT_COMPLETION(ses->complete); spin_unlock(&tbl->slot_tbl_lock); @@ -371,7 +373,6 @@ nfs4_alloc_state_owner(void) return NULL; spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); - INIT_LIST_HEAD(&sp->so_delegations); rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); sp->so_seqid.sequence = &sp->so_sequence; spin_lock_init(&sp->so_sequence.lock); @@ -384,7 +385,7 @@ static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_server->nfs_client; spin_lock(&clp->cl_lock); rb_erase(&sp->so_client_node, &clp->cl_state_owners); @@ -406,7 +407,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct new = nfs4_alloc_state_owner(); if (new == NULL) return NULL; - new->so_client = clp; new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); @@ -423,7 +423,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_server->nfs_client; struct rpc_cred *cred = sp->so_cred; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) @@ -602,12 +602,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) * that is compatible with current->files */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *pos; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) + if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) continue; + switch (pos->ls_owner.lo_type) { + case NFS4_POSIX_LOCK_TYPE: + if (pos->ls_owner.lo_u.posix_owner != fl_owner) + continue; + break; + case NFS4_FLOCK_LOCK_TYPE: + if (pos->ls_owner.lo_u.flock_owner != fl_pid) + continue; + } atomic_inc(&pos->ls_count); return pos; } @@ -619,10 +628,10 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - struct nfs_client *clp = state->owner->so_client; + struct nfs_client *clp = state->owner->so_server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) @@ -633,7 +642,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f lsp->ls_seqid.sequence = &lsp->ls_sequence; atomic_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; + lsp->ls_owner.lo_type = type; + switch (lsp->ls_owner.lo_type) { + case NFS4_FLOCK_LOCK_TYPE: + lsp->ls_owner.lo_u.flock_owner = fl_pid; + break; + case NFS4_POSIX_LOCK_TYPE: + lsp->ls_owner.lo_u.posix_owner = fl_owner; + break; + default: + kfree(lsp); + return NULL; + } spin_lock(&clp->cl_lock); nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); @@ -643,7 +663,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_client; + struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; spin_lock(&clp->cl_lock); nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); @@ -657,13 +677,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) { struct nfs4_lock_state *lsp, *new = NULL; for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, pid, type); if (lsp != NULL) break; if (new != NULL) { @@ -674,7 +694,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ break; } spin_unlock(&state->state_lock); - new = nfs4_alloc_lock_state(state, owner); + new = nfs4_alloc_lock_state(state, owner, pid, type); if (new == NULL) return NULL; } @@ -701,6 +721,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) + nfs4_release_lockowner(lsp); nfs4_free_lock_state(lsp); } @@ -728,7 +750,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + if (fl->fl_flags & FL_POSIX) + lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); + else if (fl->fl_flags & FL_FLOCK) + lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); + else + return -EINVAL; if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -740,7 +767,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) +void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) { struct nfs4_lock_state *lsp; int seq; @@ -753,7 +780,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f return; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); spin_unlock(&state->state_lock); @@ -1041,11 +1068,11 @@ restart: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_client, state); + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); break; case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_client, state); + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -1120,8 +1147,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) return; - nfs4_reclaim_complete(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); @@ -1211,8 +1237,8 @@ restart: static int nfs4_check_lease(struct nfs_client *clp) { struct rpc_cred *cred; - struct nfs4_state_maintenance_ops *ops = - nfs4_state_renewal_ops[clp->cl_minorversion]; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; int status = -NFS4ERR_EXPIRED; /* Is the client already known to have an expired lease? */ @@ -1235,8 +1261,8 @@ out: static int nfs4_reclaim_lease(struct nfs_client *clp) { struct rpc_cred *cred; - struct nfs4_state_recovery_ops *ops = - nfs4_reboot_recovery_ops[clp->cl_minorversion]; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; int status = -ENOENT; cred = ops->get_clid_cred(clp); @@ -1444,7 +1470,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + clp->cl_mvops->reboot_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) continue; @@ -1458,7 +1484,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { status = nfs4_do_reclaim(clp, - nfs4_nograce_recovery_ops[clp->cl_minorversion]); + clp->cl_mvops->nograce_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 65c8dae..08ef912 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -202,14 +202,17 @@ static int nfs4_stat_to_errno(int); #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ - 1 + encode_stateid_maxsz + 8) + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ (8 + decode_lockowner_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) -#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) +#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ + encode_lockowner_maxsz) #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ @@ -217,6 +220,11 @@ static int nfs4_stat_to_errno(int); 4) #define decode_locku_maxsz (op_decode_hdr_maxsz + \ decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) #define encode_access_maxsz (op_encode_hdr_maxsz + 1) #define decode_access_maxsz (op_decode_hdr_maxsz + 2) #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ @@ -471,6 +479,12 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + (compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -744,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. * NOTE: the replied tag SHOULD be the same is the one sent, @@ -1042,6 +1056,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl) return fl->fl_end - fl->fl_start + 1; } +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, lowner->id); +} + /* * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 @@ -1058,14 +1083,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); - p = xdr_encode_hyper(p, args->lock_owner.clientid); - *p++ = cpu_to_be32(16); - p = xdr_encode_opaque_fixed(p, "lock id:", 8); - xdr_encode_hyper(p, args->lock_owner.id); + encode_lockowner(xdr, &args->lock_owner); } else { p = reserve_space(xdr, NFS4_STATEID_SIZE+4); @@ -1080,15 +1102,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - p = reserve_space(xdr, 52); + p = reserve_space(xdr, 24); *p++ = cpu_to_be32(OP_LOCKT); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); - p = xdr_encode_hyper(p, args->lock_owner.clientid); - *p++ = cpu_to_be32(16); - p = xdr_encode_opaque_fixed(p, "lock id:", 8); - xdr_encode_hyper(p, args->lock_owner.id); + encode_lockowner(xdr, &args->lock_owner); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1108,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar hdr->replen += decode_locku_maxsz; } +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_lockowner(xdr, lowner); + hdr->nops++; + hdr->replen += decode_release_lockowner_maxsz; +} + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { int len = name->len; @@ -1172,7 +1202,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op break; default: clp = arg->server->nfs_client; - if (clp->cl_minorversion > 0) { + if (clp->cl_mvops->minor_version > 0) { if (nfs4_has_persistent_session(clp)) { *p = cpu_to_be32(NFS4_CREATE_GUARDED); encode_attrs(xdr, arg->u.attrs, arg->server); @@ -1324,14 +1354,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) { nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { - nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); + nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); @@ -1344,7 +1374,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1523,7 +1553,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -1704,7 +1734,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) { #if defined(CONFIG_NFS_V4_1) if (args->sa_session) - return args->sa_session->clp->cl_minorversion; + return args->sa_session->clp->cl_mvops->minor_version; #endif /* CONFIG_NFS_V4_1 */ return 0; } @@ -2048,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_ return 0; } +static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = 0, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_release_lockowner(&xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); + return 0; +} + /* * Encode a READLINK request */ @@ -2395,7 +2439,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = args->client->cl_minorversion, + .minorversion = args->client->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2413,7 +2457,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = args->client->cl_minorversion, + .minorversion = args->client->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2431,7 +2475,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = session->clp->cl_minorversion, + .minorversion = session->clp->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -3973,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) return status; } +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + static int decode_lookup(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LOOKUP); @@ -5259,6 +5308,19 @@ out: return status; } +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_release_lockowner(&xdr); + return status; +} + /* * Decode READLINK response */ @@ -5866,6 +5928,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 6bd19d8..df101d9 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -105,7 +105,7 @@ static char nfs_root_name[256] __initdata = ""; static __be32 servaddr __initdata = 0; /* Name of directory to mount */ -static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, }; +static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = { 0, }; /* NFS-related data */ static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a3654e5..9194902 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -79,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); + req->wb_lock_context = nfs_get_lock_context(ctx); kref_init(&req->wb_kref); return req; } @@ -141,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } + if (l_ctx != NULL) { + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } if (ctx != NULL) { put_nfs_open_context(ctx); req->wb_context = NULL; @@ -235,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, { if (req->wb_context->cred != prev->wb_context->cred) return 0; - if (req->wb_context->lockowner != prev->wb_context->lockowner) + if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) return 0; if (req->wb_context->state != prev->wb_context->state) return 0; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6e2b06e..87adc27 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->res.fattr = &data->fattr; data->res.count = count; @@ -410,7 +411,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, + if (nfs4_setup_sequence(NFS_SERVER(data->inode), &data->args.seq_args, &data->res.seq_res, 0, task)) return; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9df16d..f1ae39f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -546,6 +546,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, { struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + switch (sap->sa_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sap; @@ -1780,6 +1783,7 @@ static int nfs_validate_mount_data(void *options, * can deal with. */ args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= NFS_MOUNT_LEGACY_INTERFACE; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index a2242af..2f84ada 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata) struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); - if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + if (nfs4_setup_sequence(server, &data->args.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 91679e2..874972d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -222,7 +222,7 @@ static void nfs_end_page_writeback(struct page *page) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) { struct inode *inode = page->mapping->host; struct nfs_page *req; @@ -241,7 +241,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page) * request as dirty (in which case we don't care). */ spin_unlock(&inode->i_lock); - ret = nfs_wait_on_request(req); + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; nfs_release_request(req); if (ret != 0) return ERR_PTR(ret); @@ -256,12 +259,12 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page); + req = nfs_find_and_lock_request(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -283,12 +286,20 @@ out: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { struct inode *inode = page->mapping->host; + int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - return nfs_page_async_flush(pgio, page); + ret = nfs_page_async_flush(pgio, page, + wbc->sync_mode == WB_SYNC_NONE || + wbc->nonblocking != 0); + if (ret == -EAGAIN) { + redirty_page_for_writepage(wbc, page); + ret = 0; + } + return ret; } /* @@ -689,7 +700,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || + req->wb_lock_context->pid != current->tgid; nfs_release_request(req); if (!do_flush) return 0; @@ -813,6 +826,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; @@ -1036,9 +1050,9 @@ out: void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; - if (nfs4_setup_sequence(clp, &data->args.seq_args, + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); @@ -1379,7 +1393,7 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_release = nfs_commit_release, }; -static int nfs_commit_inode(struct inode *inode, int how) +int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; @@ -1443,11 +1457,6 @@ out_mark_dirty: return ret; } #else -static int nfs_commit_inode(struct inode *inode, int how) -{ - return 0; -} - static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { return 0; @@ -1546,7 +1555,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - req = nfs_find_and_lock_request(page); + req = nfs_find_and_lock_request(page, false); ret = PTR_ERR(req); if (IS_ERR(req)) goto out; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 3d68f45..5b7e302 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_read(rqstp, &resp->fh, NULL, + nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, &resp->count); @@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, fh_init(&resp->fh, NFS3_FHSIZE); nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, &argp->attrs, S_IFDIR, 0, &resp->fh); - + fh_unlock(&resp->dirfh); RETURN_STATUS(nfserr); } @@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp, type = nfs3_ftypes[argp->ftype]; nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, &argp->attrs, type, rdev, &resp->fh); - + fh_unlock(&resp->dirfh); RETURN_STATUS(nfserr); } @@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, /* Unlink. -S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); RETURN_STATUS(nfserr); } @@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index eb78e7e..988cbb3 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -143,8 +143,6 @@ struct nfs4_cb_compound_hdr { u32 minorversion; /* res */ int status; - u32 taglen; - char *tag; }; static struct { @@ -205,6 +203,16 @@ nfs_cb_stat_to_errno(int stat) */ static void +encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +{ + __be32 *p; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); +} + +static void encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { __be32 * p; @@ -229,10 +237,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, __be32 *p; int len = dp->dl_fh.fh_size; - RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); + RESERVE_SPACE(4); WRITE32(OP_CB_RECALL); - WRITE32(dp->dl_stateid.si_generation); - WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); + encode_stateid(xdr, &dp->dl_stateid); + RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); WRITE32(0); /* truncate optimization not implemented */ WRITE32(len); WRITEMEM(&dp->dl_fh.fh_base, len); @@ -293,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, static int decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ __be32 *p; + u32 taglen; READ_BUF(8); READ32(hdr->status); - READ32(hdr->taglen); - READ_BUF(hdr->taglen + 4); - hdr->tag = (char *)p; - p += XDR_QUADLEN(hdr->taglen); + /* We've got no use for the tag; ignore it: */ + READ32(taglen); + READ_BUF(taglen + 4); + p += XDR_QUADLEN(taglen); READ32(hdr->nops); return 0; } @@ -667,28 +676,28 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) } switch (task->tk_status) { - case -EIO: + case 0: + return; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall + * before open reply granting delegation */ + break; + default: /* Network partition? */ atomic_set(&clp->cl_cb_set, 0); warn_no_callback_path(clp, task->tk_status); if (current_rpc_client != task->tk_client) { /* queue a callback on the new connection: */ + atomic_inc(&dp->dl_count); nfsd4_cb_recall(dp); return; } - case -EBADHANDLE: - case -NFS4ERR_BAD_STATEID: - /* Race: client probably got cb_recall - * before open reply granting delegation */ - break; - default: - /* success, or error we can't handle */ - return; } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; - rpc_restart_call(task); + rpc_restart_call_prepare(task); return; } else { atomic_set(&clp->cl_cb_set, 0); @@ -752,18 +761,16 @@ static void _nfsd4_cb_recall(struct nfs4_delegation *dp) .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], .rpc_cred = callback_cred }; - int status; - if (clnt == NULL) + if (clnt == NULL) { + nfs4_put_delegation(dp); return; /* Client is shutting down; give up. */ + } args->args_op = dp; msg.rpc_argp = args; dp->dl_retries = 1; - status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, - &nfsd4_cb_recall_ops, dp); - if (status) - nfs4_put_delegation(dp); + rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); } void nfsd4_do_callback_rpc(struct work_struct *w) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4a27347..2e73571 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -51,7 +51,6 @@ static time_t boot_time; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; -static u32 nfs4_init; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -163,6 +162,46 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; static struct list_head file_hashtbl[FILE_HASH_SIZE]; static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; +static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); + atomic_inc(&fp->fi_access[oflag]); +} + +static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_get_access(fp, O_RDONLY); + __nfs4_file_get_access(fp, O_WRONLY); + } else + __nfs4_file_get_access(fp, oflag); +} + +static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) +{ + if (fp->fi_fds[oflag]) { + fput(fp->fi_fds[oflag]); + fp->fi_fds[oflag] = NULL; + } +} + +static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (atomic_dec_and_test(&fp->fi_access[oflag])) { + nfs4_file_put_fd(fp, O_RDWR); + nfs4_file_put_fd(fp, oflag); + } +} + +static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_put_access(fp, O_RDONLY); + __nfs4_file_put_access(fp, O_WRONLY); + } else + __nfs4_file_put_access(fp, oflag); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -171,6 +210,13 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; dprintk("NFSD alloc_init_deleg\n"); + /* + * Major work on the lease subsystem (for example, to support + * calbacks on stat) will be required before we can support + * write delegations properly. + */ + if (type != NFS4_OPEN_DELEGATE_READ) + return NULL; if (fp->fi_had_conflict) return NULL; if (num_delegations > max_delegations) @@ -185,9 +231,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; + nfs4_file_get_access(fp, O_RDONLY); dp->dl_flock = NULL; - get_file(stp->st_vfs_file); - dp->dl_vfs_file = stp->st_vfs_file; dp->dl_type = type; dp->dl_ident = cb->cb_ident; dp->dl_stateid.si_boot = boot_time; @@ -222,15 +267,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_close_delegation(struct nfs4_delegation *dp) { - struct file *filp = dp->dl_vfs_file; + struct file *filp = find_readable_file(dp->dl_file); dprintk("NFSD: close_delegation dp %p\n",dp); - dp->dl_vfs_file = NULL; - /* The following nfsd_close may not actually close the file, - * but we want to remove the lease in any case. */ if (dp->dl_flock) vfs_setlease(filp, F_UNLCK, &dp->dl_flock); - nfsd_close(filp); + nfs4_file_put_access(dp->dl_file, O_RDONLY); } /* Called under the state lock. */ @@ -302,8 +344,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp) static void release_lock_stateid(struct nfs4_stateid *stp) { + struct file *file; + unhash_generic_stateid(stp); - locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); + file = find_any_file(stp->st_file); + if (file) + locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); free_generic_stateid(stp); } @@ -341,11 +387,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp) } } +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ +static void +set_access(unsigned int *access, unsigned long bmap) { + int i; + + *access = 0; + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + *access |= i; + } +} + +static void +set_deny(unsigned int *deny, unsigned long bmap) { + int i; + + *deny = 0; + for (i = 0; i < 4; i++) { + if (test_bit(i, &bmap)) + *deny |= i ; + } +} + +static int +test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { + unsigned int access, deny; + + set_access(&access, stp->st_access_bmap); + set_deny(&deny, stp->st_deny_bmap); + if ((access & open->op_share_deny) || (deny & open->op_share_access)) + return 0; + return 1; +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + BUG(); +} + +static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp) +{ + unsigned int access; + + set_access(&access, stp->st_access_bmap); + return nfs4_access_to_omode(access); +} + static void release_open_stateid(struct nfs4_stateid *stp) { + int oflag = nfs4_access_bmap_to_omode(stp); + unhash_generic_stateid(stp); release_stateid_lockowners(stp); - nfsd_close(stp->st_vfs_file); + nfs4_file_put_access(stp->st_file, oflag); free_generic_stateid(stp); } @@ -457,7 +577,7 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) spin_unlock(&nfsd_drc_lock); if (fchan->maxreqs == 0) - return nfserr_serverfault; + return nfserr_jukebox; fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; return 0; @@ -542,7 +662,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) + sizeof(struct nfsd4_session) > PAGE_SIZE); - status = nfserr_serverfault; + status = nfserr_jukebox; /* allocate struct nfsd4_session and slot table pointers in one piece */ slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *); new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); @@ -591,10 +711,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) dump_sessionid(__func__, sessionid); idx = hash_sessionid(sessionid); - dprintk("%s: idx is %d\n", __func__, idx); /* Search in the appropriate list */ list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { - dump_sessionid("list traversal", &elem->se_sessionid); if (!memcmp(elem->se_sessionid.data, sessionid->data, NFS4_MAX_SESSIONID_LEN)) { return elem; @@ -714,7 +832,6 @@ release_session_client(struct nfsd4_session *session) } else renew_client_locked(clp); spin_unlock(&client_lock); - nfsd4_put_session(session); } /* must be called under the client_lock */ @@ -1220,7 +1337,7 @@ out_new: /* Normal case */ new = create_client(exid->clname, dname, rqstp, &verf); if (new == NULL) { - status = nfserr_serverfault; + status = nfserr_jukebox; goto out; } @@ -1760,6 +1877,8 @@ alloc_init_file(struct inode *ino) fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); spin_lock(&recall_lock); list_add(&fp->fi_hash, &file_hashtbl[hashval]); spin_unlock(&recall_lock); @@ -1971,57 +2090,6 @@ static inline int deny_valid(u32 x) } /* - * We store the NONE, READ, WRITE, and BOTH bits separately in the - * st_{access,deny}_bmap field of the stateid, in order to track not - * only what share bits are currently in force, but also what - * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. - * - * XXX: This enforcement is actually incomplete, since we don't keep - * track of access/deny bit combinations; so, e.g., we allow: - * - * OPEN allow read, deny write - * OPEN allow both, deny none - * DOWNGRADE allow read, deny none - * - * which we should reject. - */ -static void -set_access(unsigned int *access, unsigned long bmap) { - int i; - - *access = 0; - for (i = 1; i < 4; i++) { - if (test_bit(i, &bmap)) - *access |= i; - } -} - -static void -set_deny(unsigned int *deny, unsigned long bmap) { - int i; - - *deny = 0; - for (i = 0; i < 4; i++) { - if (test_bit(i, &bmap)) - *deny |= i ; - } -} - -static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { - unsigned int access, deny; - - set_access(&access, stp->st_access_bmap); - set_deny(&deny, stp->st_deny_bmap); - if ((access & open->op_share_deny) || (deny & open->op_share_access)) - return 0; - return 1; -} - -/* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid */ @@ -2052,14 +2120,12 @@ out: } static inline void -nfs4_file_downgrade(struct file *filp, unsigned int share_access) +nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) { - if (share_access & NFS4_SHARE_ACCESS_WRITE) { - drop_file_write_access(filp); - spin_lock(&filp->f_lock); - filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; - spin_unlock(&filp->f_lock); - } + if (share_access & NFS4_SHARE_ACCESS_WRITE) + nfs4_file_put_access(fp, O_WRONLY); + if (share_access & NFS4_SHARE_ACCESS_READ) + nfs4_file_put_access(fp, O_RDONLY); } /* @@ -2255,6 +2321,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) return NULL; } +int share_access_to_flags(u32 share_access) +{ + share_access &= ~NFS4_SHARE_WANT_MASK; + + return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; +} + static __be32 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) @@ -2265,8 +2338,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, *dp = find_delegation_file(fp, &open->op_delegate_stateid); if (*dp == NULL) goto out; - flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ? - RD_STATE : WR_STATE; + flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(*dp, flags); if (status) *dp = NULL; @@ -2308,30 +2380,53 @@ nfs4_alloc_stateid(void) return kmem_cache_alloc(stateid_slab, GFP_KERNEL); } +static inline int nfs4_access_to_access(u32 nfs4_access) +{ + int flags = 0; + + if (nfs4_access & NFS4_SHARE_ACCESS_READ) + flags |= NFSD_MAY_READ; + if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) + flags |= NFSD_MAY_WRITE; + return flags; +} + +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file +*fp, struct svc_fh *cur_fh, u32 nfs4_access) +{ + __be32 status; + int oflag = nfs4_access_to_omode(nfs4_access); + int access = nfs4_access_to_access(nfs4_access); + + if (!fp->fi_fds[oflag]) { + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, + &fp->fi_fds[oflag]); + if (status == nfserr_dropit) + status = nfserr_jukebox; + if (status) + return status; + } + nfs4_file_get_access(fp, oflag); + + return nfs_ok; +} + static __be32 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, - struct nfs4_delegation *dp, - struct svc_fh *cur_fh, int flags) + struct nfs4_file *fp, struct svc_fh *cur_fh, + struct nfsd4_open *open) { struct nfs4_stateid *stp; + __be32 status; stp = nfs4_alloc_stateid(); if (stp == NULL) return nfserr_resource; - if (dp) { - get_file(dp->dl_vfs_file); - stp->st_vfs_file = dp->dl_vfs_file; - } else { - __be32 status; - status = nfsd_open(rqstp, cur_fh, S_IFREG, flags, - &stp->st_vfs_file); - if (status) { - if (status == nfserr_dropit) - status = nfserr_jukebox; - kmem_cache_free(stateid_slab, stp); - return status; - } + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access); + if (status) { + kmem_cache_free(stateid_slab, stp); + return status; } *stpp = stp; return 0; @@ -2353,35 +2448,30 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) { - struct file *filp = stp->st_vfs_file; - struct inode *inode = filp->f_path.dentry->d_inode; - unsigned int share_access, new_writer; + u32 op_share_access, new_access; __be32 status; - set_access(&share_access, stp->st_access_bmap); - new_writer = (~share_access) & open->op_share_access - & NFS4_SHARE_ACCESS_WRITE; - - if (new_writer) { - int err = get_write_access(inode); - if (err) - return nfserrno(err); - err = mnt_want_write(cur_fh->fh_export->ex_path.mnt); - if (err) - return nfserrno(err); - file_take_write(filp); + set_access(&new_access, stp->st_access_bmap); + new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK; + + if (new_access) { + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access); + if (status) + return status; } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { - if (new_writer) - put_write_access(inode); + if (new_access) { + int oflag = nfs4_access_to_omode(new_access); + nfs4_file_put_access(fp, oflag); + } return status; } /* remember the open */ - filp->f_mode |= open->op_share_access; - __set_bit(open->op_share_access, &stp->st_access_bmap); + op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + __set_bit(op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); return nfs_ok; @@ -2444,13 +2534,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl.fl_end = OFFSET_MAX; fl.fl_owner = (fl_owner_t)dp; - fl.fl_file = stp->st_vfs_file; + fl.fl_file = find_readable_file(stp->st_file); + BUG_ON(!fl.fl_file); fl.fl_pid = current->tgid; /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callbacks fl_mylease and fl_change are used */ - if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) { + if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); unhash_delegation(dp); flag = NFS4_OPEN_DELEGATE_NONE; @@ -2514,18 +2605,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ - status = nfs4_upgrade_open(rqstp, current_fh, stp, open); + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; update_stateid(&stp->st_stateid); } else { - /* Stateid was not found, this is a new OPEN */ - int flags = 0; - if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - flags |= NFSD_MAY_READ; - if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flags |= NFSD_MAY_WRITE; - status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags); + status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; init_stateid(stp, fp, open); @@ -2727,7 +2812,7 @@ search_close_lru(u32 st_id, int flags) static inline int nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) { - return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; } static int @@ -2760,6 +2845,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) { __be32 status = nfserr_openmode; + /* For lock stateid's, we test the parent open, not the lock: */ + if (stp->st_openstp) + stp = stp->st_openstp; if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) goto out; if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) @@ -2872,7 +2960,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, goto out; renew_client(dp->dl_client); if (filpp) - *filpp = dp->dl_vfs_file; + *filpp = find_readable_file(dp->dl_file); + BUG_ON(!*filpp); } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) @@ -2889,8 +2978,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (status) goto out; renew_client(stp->st_stateowner->so_client); - if (filpp) - *filpp = stp->st_vfs_file; + if (filpp) { + if (flags & RD_STATE) + *filpp = find_readable_file(stp->st_file); + else + *filpp = find_writeable_file(stp->st_file); + BUG_ON(!*filpp); /* assured by check_openmode */ + } } status = nfs_ok; out: @@ -3126,8 +3220,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, goto out; } set_access(&share_access, stp->st_access_bmap); - nfs4_file_downgrade(stp->st_vfs_file, - share_access & ~od->od_share_access); + nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access); reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); @@ -3346,11 +3439,9 @@ static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { struct nfs4_stateowner *sop; - unsigned int hval; if (fl->fl_lmops == &nfsd_posix_mng_ops) { sop = (struct nfs4_stateowner *) fl->fl_owner; - hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; @@ -3446,8 +3537,6 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; stp->st_stateid.si_generation = 0; - stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */ - stp->st_access_bmap = open_stp->st_access_bmap; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; @@ -3547,7 +3636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lock_sop = lock->lk_replay_owner; } /* lock->lk_replay_owner and lock_stp have been created or found */ - filp = lock_stp->st_vfs_file; status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) @@ -3560,11 +3648,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: + filp = find_readable_file(lock_stp->st_file); file_lock.fl_type = F_RDLCK; cmd = F_SETLK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: + filp = find_writeable_file(lock_stp->st_file); file_lock.fl_type = F_WRLCK; cmd = F_SETLK; break; @@ -3572,6 +3662,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_inval; goto out; } + if (!filp) { + status = nfserr_openmode; + goto out; + } file_lock.fl_owner = (fl_owner_t)lock_sop; file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; @@ -3740,7 +3834,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &locku->lu_stateowner, &stp, NULL))) goto out; - filp = stp->st_vfs_file; + filp = find_any_file(stp->st_file); + if (!filp) { + status = nfserr_lock_range; + goto out; + } BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; @@ -3787,10 +3885,10 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) { struct file_lock **flpp; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = filp->fi_inode; int status = 0; lock_kernel(); @@ -3841,7 +3939,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_vfs_file, sop)) + if (check_for_locks(stp->st_file, sop)) goto out; /* Note: so_perclient unused for lockowners, * so it's OK to fool with here. */ @@ -4066,16 +4164,8 @@ out_free_laundry: int nfs4_state_start(void) { - int ret; - - if (nfs4_init) - return 0; nfsd4_load_reboot_recovery_data(); - ret = __nfs4_state_start(); - if (ret) - return ret; - nfs4_init = 1; - return 0; + return __nfs4_state_start(); } static void @@ -4110,7 +4200,6 @@ __nfs4_state_shutdown(void) } nfsd4_shutdown_recdir(); - nfs4_init = 0; } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ac17a70..f8931ac 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2630,7 +2630,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } read->rd_vlen = v; - nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); @@ -3325,6 +3325,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo } /* Renew the clientid on success and on replay */ release_session_client(cs->session); + nfsd4_put_session(cs->session); } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 508941c..b53b1d0 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -949,15 +949,12 @@ static ssize_t __write_ports_addfd(char *buf) if (err != 0) return err; - err = lockd_up(); - if (err != 0) - goto out; - err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); - if (err < 0) - lockd_down(); + if (err < 0) { + svc_destroy(nfsd_serv); + return err; + } -out: /* Decrease the count, but don't shut down the service */ nfsd_serv->sv_nrthreads--; return err; @@ -978,9 +975,6 @@ static ssize_t __write_ports_delfd(char *buf) if (nfsd_serv != NULL) len = svc_sock_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT, toclose); - if (len >= 0) - lockd_down(); - kfree(toclose); return len; } @@ -1014,6 +1008,9 @@ static ssize_t __write_ports_addxprt(char *buf) PF_INET6, port, SVC_SOCK_ANONYMOUS); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; + + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; return 0; out_close: xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); @@ -1022,8 +1019,7 @@ out_close: svc_xprt_put(xprt); } out_err: - /* Decrease the count, but don't shut down the service */ - nfsd_serv->sv_nrthreads--; + svc_destroy(nfsd_serv); return err; } @@ -1194,7 +1190,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) bsize = NFSSVC_MAXBLKSIZE; bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); - if (nfsd_serv && nfsd_serv->sv_nrthreads) { + if (nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } @@ -1310,6 +1306,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) return -EINVAL; status = nfs4_reset_recoverydir(recdir); + if (status) + return status; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7237776..b76ac3a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -153,6 +153,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) #define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) #define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE) #define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a047ad6..08e1726 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, + nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, &resp->count); @@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, * gospel of sun micro */ if (type != S_IFREG) { - int is_borc = 0; if (type != S_IFBLK && type != S_IFCHR) { rdev = 0; } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { @@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, type = S_IFIFO; } else { /* Okay, char or block special */ - is_borc = 1; if (!rdev) rdev = wanted; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 06b2a26..e2c4346 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -180,15 +180,80 @@ int nfsd_nrthreads(void) return rv; } +static int nfsd_init_socks(int port) +{ + int error; + if (!list_empty(&nfsd_serv->sv_permsocks)) + return 0; + + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + return 0; +} + +static bool nfsd_up = false; + +static int nfsd_startup(unsigned short port, int nrservs) +{ + int ret; + + if (nfsd_up) + return 0; + /* + * Readahead param cache - will no-op if it already exists. + * (Note therefore results will be suboptimal if number of + * threads is modified after nfsd start.) + */ + ret = nfsd_racache_init(2*nrservs); + if (ret) + return ret; + ret = nfsd_init_socks(port); + if (ret) + goto out_racache; + ret = lockd_up(); + if (ret) + goto out_racache; + ret = nfs4_state_start(); + if (ret) + goto out_lockd; + nfsd_up = true; + return 0; +out_lockd: + lockd_down(); +out_racache: + nfsd_racache_shutdown(); + return ret; +} + +static void nfsd_shutdown(void) +{ + /* + * write_ports can create the server without actually starting + * any threads--if we get shut down before any threads are + * started, then nfsd_last_thread will be run before any of this + * other initialization has been done. + */ + if (!nfsd_up) + return; + nfs4_state_shutdown(); + lockd_down(); + nfsd_racache_shutdown(); + nfsd_up = false; +} + static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ - struct svc_xprt *xprt; - list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) - lockd_down(); nfsd_serv = NULL; - nfsd_racache_shutdown(); - nfs4_state_shutdown(); + nfsd_shutdown(); printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); @@ -263,45 +328,18 @@ int nfsd_create_serv(void) nfsd_max_blksize >= 8*1024*2) nfsd_max_blksize /= 2; } + nfsd_reset_versions(); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) - err = -ENOMEM; - else - set_max_drc(); + return -ENOMEM; + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; } -static int nfsd_init_socks(int port) -{ - int error; - if (!list_empty(&nfsd_serv->sv_permsocks)) - return 0; - - error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, - SVC_SOCK_DEFAULTS); - if (error < 0) - return error; - - error = lockd_up(); - if (error < 0) - return error; - - error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, - SVC_SOCK_DEFAULTS); - if (error < 0) - return error; - - error = lockd_up(); - if (error < 0) - return error; - - return 0; -} - int nfsd_nrpools(void) { if (nfsd_serv == NULL) @@ -376,10 +414,16 @@ int nfsd_set_nrthreads(int n, int *nthreads) return err; } +/* + * Adjust the number of threads and return the new number of threads. + * This is also the function that starts the server if necessary, if + * this is the first time nrservs is nonzero. + */ int nfsd_svc(unsigned short port, int nrservs) { int error; + bool nfsd_up_before; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -391,34 +435,29 @@ nfsd_svc(unsigned short port, int nrservs) if (nrservs == 0 && nfsd_serv == NULL) goto out; - /* Readahead param cache - will no-op if it already exists */ - error = nfsd_racache_init(2*nrservs); - if (error<0) - goto out; - error = nfs4_state_start(); + error = nfsd_create_serv(); if (error) goto out; - nfsd_reset_versions(); - - error = nfsd_create_serv(); + nfsd_up_before = nfsd_up; + error = nfsd_startup(port, nrservs); if (error) - goto out; - error = nfsd_init_socks(port); - if (error) - goto failure; - + goto out_destroy; error = svc_set_num_threads(nfsd_serv, NULL, nrservs); - if (error == 0) - /* We are holding a reference to nfsd_serv which - * we don't want to count in the return value, - * so subtract 1 - */ - error = nfsd_serv->sv_nrthreads - 1; - failure: + if (error) + goto out_shutdown; + /* We are holding a reference to nfsd_serv which + * we don't want to count in the return value, + * so subtract 1 + */ + error = nfsd_serv->sv_nrthreads - 1; +out_shutdown: + if (error < 0 && !nfsd_up_before) + nfsd_shutdown(); +out_destroy: svc_destroy(nfsd_serv); /* Release server */ - out: +out: mutex_unlock(&nfsd_mutex); return error; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 006c842..7731a75 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -88,7 +88,6 @@ struct nfs4_delegation { struct nfs4_client *dl_client; struct nfs4_file *dl_file; struct file_lock *dl_flock; - struct file *dl_vfs_file; u32 dl_type; time_t dl_time; /* For recall: */ @@ -342,12 +341,50 @@ struct nfs4_file { struct list_head fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; + /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ + struct file * fi_fds[3]; + /* One each for O_RDONLY, O_WRONLY: */ + atomic_t fi_access[2]; + /* + * Each open stateid contributes 1 to either fi_readers or + * fi_writers, or both, depending on the open mode. A + * delegation also takes an fi_readers reference. Lock + * stateid's take none. + */ + atomic_t fi_readers; + atomic_t fi_writers; struct inode *fi_inode; u32 fi_id; /* used with stateowner->so_id * for stateid_hashtbl hash */ bool fi_had_conflict; }; +/* XXX: for first cut may fall back on returning file that doesn't work + * at all? */ +static inline struct file *find_writeable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + return f->fi_fds[O_WRONLY]; +} + +static inline struct file *find_readable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + return f->fi_fds[O_RDONLY]; +} + +static inline struct file *find_any_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + else if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_WRONLY]; + else + return f->fi_fds[O_RDONLY]; +} + /* * nfs4_stateid can either be an open stateid or (eventually) a lock stateid * @@ -373,7 +410,6 @@ struct nfs4_stateid { struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; stateid_t st_stateid; - struct file * st_vfs_file; unsigned long st_access_bmap; unsigned long st_deny_bmap; struct nfs4_stateid * st_openstp; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 3c11112..9df85a1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -604,7 +604,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } -#endif /* defined(CONFIG_NFS_V4) */ +#endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 /* @@ -903,7 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { struct inode *inode; - struct raparms *ra; mm_segment_t oldfs; __be32 err; int host_err; @@ -914,12 +913,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) goto out; - /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - - if (ra && ra->p_set) - file->f_ra = ra->p_ra; - if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { .len = 0, @@ -937,16 +930,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, set_fs(oldfs); } - /* Write back readahead params */ - if (ra) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); - } - if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; @@ -1086,8 +1069,45 @@ out: * on entry. On return, *count contains the number of bytes actually read. * N.B. After this call fhp needs an fh_put */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + struct file *file; + struct inode *inode; + struct raparms *ra; + __be32 err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (err) + return err; + + inode = file->f_path.dentry->d_inode; + + /* Get readahead parameters */ + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + + if (ra && ra->p_set) + file->f_ra = ra->p_ra; + + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + + /* Write back readahead params */ + if (ra) { + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); + } + + nfsd_close(file); + return err; +} + +/* As above, but use the provided file descriptor. */ __be32 -nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, +nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { @@ -1099,13 +1119,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - } else { - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); - if (err) - goto out; - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - nfsd_close(file); - } + } else /* Note file may still be NULL in NFSv4 special stateid case: */ + err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); out: return err; } @@ -1631,7 +1646,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *name, int len, struct svc_fh *tfhp) { struct dentry *ddir, *dnew, *dold; - struct inode *dirp, *dest; + struct inode *dirp; __be32 err; int host_err; @@ -1659,7 +1674,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, goto out_nfserr; dold = tfhp->fh_dentry; - dest = dold->d_inode; host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); if (host_err) { @@ -2038,7 +2052,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { struct inode *inode = dentry->d_inode; - struct path path; int err; if (acc == NFSD_MAY_NOP) @@ -2111,15 +2124,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) err = inode_permission(inode, MAY_EXEC); - if (err) - goto nfsd_out; - /* Do integrity (permission) checking now, but defer incrementing - * IMA counts to the actual file open. - */ - path.mnt = exp->ex_path.mnt; - path.dentry = dentry; -nfsd_out: return err? nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 217a62c..9a370a5 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -64,7 +64,9 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, int, struct file **); void nfsd_close(struct file *); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index effdbdb..3dbdc1d 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -26,6 +26,8 @@ #include "nilfs.h" #include "bmap.h" #include "sb.h" +#include "btree.h" +#include "direct.h" #include "btnode.h" #include "mdt.h" #include "dat.h" @@ -533,7 +535,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) { - memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); + memcpy(gcbmap, bmap, sizeof(*bmap)); init_rwsem(&gcbmap->b_sem); lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; @@ -541,7 +543,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) { - memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); + memcpy(bmap, gcbmap, sizeof(*bmap)); init_rwsem(&bmap->b_sem); lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index 9980d7d..a20569b 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -32,11 +32,6 @@ #define NILFS_BMAP_INVALID_PTR 0 -#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey) -#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key) -#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr) -#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr) - #define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) @@ -71,7 +66,7 @@ struct nilfs_bmap_operations { int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); - int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); + int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *); void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, struct list_head *); @@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) * @b_last_allocated_ptr: last allocated ptr for data block * @b_ptr_type: pointer type * @b_state: state + * @b_nchildren_per_block: maximum number of child nodes for non-root nodes */ struct nilfs_bmap { union { @@ -123,6 +119,7 @@ struct nilfs_bmap { __u64 b_last_allocated_ptr; int b_ptr_type; int b_state; + __u16 b_nchildren_per_block; }; /* pointer type */ @@ -224,6 +221,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, nilfs_dat_abort_end(dat, &req->bpr_req); } +static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key, + __u64 ptr) +{ + bmap->b_last_allocated_key = key; + bmap->b_last_allocated_ptr = ptr; +} + __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h deleted file mode 100644 index d41509b..0000000 --- a/fs/nilfs2/bmap_union.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * bmap_union.h - NILFS block mapping. - * - * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. - */ - -#ifndef _NILFS_BMAP_UNION_H -#define _NILFS_BMAP_UNION_H - -#include "bmap.h" -#include "direct.h" -#include "btree.h" - -/** - * nilfs_bmap_union - - * @bi_bmap: bmap structure - * @bi_btree: direct map structure - * @bi_direct: B-tree structure - */ -union nilfs_bmap_union { - struct nilfs_bmap bi_bmap; - struct nilfs_direct bi_direct; - struct nilfs_btree bi_btree; -}; - -#endif /* _NILFS_BMAP_UNION_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 447ce47..f78ab10 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -96,10 +96,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, struct buffer_head **pbh) + sector_t pblocknr, int mode, + struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); + struct page *page; int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); @@ -107,6 +109,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, return -ENOMEM; err = -EEXIST; /* internal code */ + page = bh->b_page; if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; @@ -125,7 +128,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, } } } - lock_buffer(bh); + + if (mode == READA) { + if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { + err = -EBUSY; /* internal code */ + brelse(bh); + goto out_locked; + } + } else { /* mode == READ */ + lock_buffer(bh); + } if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ @@ -136,15 +148,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ, bh); + submit_bh(mode, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ + *submit_ptr = pblocknr; err = 0; found: *pbh = bh; out_locked: - unlock_page(bh->b_page); - page_cache_release(bh->b_page); + unlock_page(page); + page_cache_release(page); return err; } diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 07da83f..7903749 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); -int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, - struct buffer_head **); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, + struct buffer_head **, sector_t *); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b27a342..300c2bc 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -66,30 +66,10 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path) /* * B-tree node operations */ -static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, - struct buffer_head **bhp) -{ - struct address_space *btnc = - &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; - int err; - - err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp); - if (err) - return err == -EEXIST ? 0 : err; - - wait_on_buffer(*bhp); - if (!buffer_uptodate(*bhp)) { - brelse(*bhp); - return -EIO; - } - return 0; -} - -static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, +static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { - struct address_space *btnc = - &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); @@ -101,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, return 0; } -static inline int -nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) +static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } -static inline void +static void nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } -static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) +static int nilfs_btree_node_root(const struct nilfs_btree_node *node) { return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } -static inline int -nilfs_btree_node_get_level(const struct nilfs_btree_node *node) +static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } -static inline void +static void nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } -static inline int -nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) +static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } -static inline void +static void nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } -static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) +static int nilfs_btree_node_size(const struct nilfs_bmap *btree) { - return 1 << btree->bt_bmap.b_inode->i_blkbits; + return 1 << btree->b_inode->i_blkbits; } -static inline int -nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, - const struct nilfs_btree *btree) +static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) { - return nilfs_btree_node_root(node) ? - NILFS_BTREE_ROOT_NCHILDREN_MIN : - NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + return btree->b_nchildren_per_block; } -static inline int -nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, - const struct nilfs_btree *btree) -{ - return nilfs_btree_node_root(node) ? - NILFS_BTREE_ROOT_NCHILDREN_MAX : - NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); -} - -static inline __le64 * +static __le64 * nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + @@ -173,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } -static inline __le64 * -nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, - const struct nilfs_btree *btree) +static __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) { - return (__le64 *)(nilfs_btree_node_dkeys(node) + - nilfs_btree_node_nchildren_max(node, btree)); + return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); } -static inline __u64 +static __u64 nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); + return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); } -static inline void +static void nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { - *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); + *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); } -static inline __u64 -nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, int index) +static __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, + int ncmax) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + - index)); + return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); } -static inline void -nilfs_btree_node_set_ptr(struct nilfs_btree *btree, - struct nilfs_btree_node *node, int index, __u64 ptr) +static void +nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, + int ncmax) { - *(nilfs_btree_node_dptrs(node, btree) + index) = - nilfs_bmap_ptr_to_dptr(ptr); + *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); } -static void nilfs_btree_node_init(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int flags, int level, int nchildren, +static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, + int level, int nchildren, int ncmax, const __u64 *keys, const __u64 *ptrs) { __le64 *dkeys; @@ -223,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, nilfs_btree_node_set_nchildren(node, nchildren); dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); + dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nchildren; i++) { - dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); - dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); + dkeys[i] = cpu_to_le64(keys[i]); + dptrs[i] = cpu_to_le64(ptrs[i]); } } /* Assume the buffer heads corresponding to left and right are locked. */ -static void nilfs_btree_node_move_left(struct nilfs_btree *btree, - struct nilfs_btree_node *left, +static void nilfs_btree_node_move_left(struct nilfs_btree_node *left, struct nilfs_btree_node *right, - int n) + int n, int lncmax, int rncmax) { __le64 *ldkeys, *rdkeys; __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; ldkeys = nilfs_btree_node_dkeys(left); - ldptrs = nilfs_btree_node_dptrs(left, btree); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); lnchildren = nilfs_btree_node_get_nchildren(left); rdkeys = nilfs_btree_node_dkeys(right); - rdptrs = nilfs_btree_node_dptrs(right, btree); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); @@ -260,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, } /* Assume that the buffer heads corresponding to left and right are locked. */ -static void nilfs_btree_node_move_right(struct nilfs_btree *btree, - struct nilfs_btree_node *left, +static void nilfs_btree_node_move_right(struct nilfs_btree_node *left, struct nilfs_btree_node *right, - int n) + int n, int lncmax, int rncmax) { __le64 *ldkeys, *rdkeys; __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; ldkeys = nilfs_btree_node_dkeys(left); - ldptrs = nilfs_btree_node_dptrs(left, btree); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); lnchildren = nilfs_btree_node_get_nchildren(left); rdkeys = nilfs_btree_node_dkeys(right); - rdptrs = nilfs_btree_node_dptrs(right, btree); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); @@ -289,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, } /* Assume that the buffer head corresponding to node is locked. */ -static void nilfs_btree_node_insert(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - __u64 key, __u64 ptr, int index) +static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index, + __u64 key, __u64 ptr, int ncmax) { __le64 *dkeys; __le64 *dptrs; int nchildren; dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); + dptrs = nilfs_btree_node_dptrs(node, ncmax); nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, @@ -306,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, memmove(dptrs + index + 1, dptrs + index, (nchildren - index) * sizeof(*dptrs)); } - dkeys[index] = nilfs_bmap_key_to_dkey(key); - dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); + dkeys[index] = cpu_to_le64(key); + dptrs[index] = cpu_to_le64(ptr); nchildren++; nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. */ -static void nilfs_btree_node_delete(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - __u64 *keyp, __u64 *ptrp, int index) +static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index, + __u64 *keyp, __u64 *ptrp, int ncmax) { __u64 key; __u64 ptr; @@ -324,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, int nchildren; dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); - key = nilfs_bmap_dkey_to_key(dkeys[index]); - ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + key = le64_to_cpu(dkeys[index]); + ptr = le64_to_cpu(dptrs[index]); nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; @@ -382,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, return s == 0; } -static inline struct nilfs_btree_node * -nilfs_btree_get_root(const struct nilfs_btree *btree) +/** + * nilfs_btree_node_broken - verify consistency of btree node + * @node: btree node block to be examined + * @size: node size (in bytes) + * @blocknr: block number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, + size_t size, sector_t blocknr) { - return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX || + (flags & NILFS_BTREE_NODE_ROOT) || + nchildren < 0 || + nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { + printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " + "level = %d, flags = 0x%x, nchildren = %d\n", + (unsigned long long)blocknr, level, flags, nchildren); + ret = 1; + } + return ret; } -static inline struct nilfs_btree_node * +int nilfs_btree_broken_node_block(struct buffer_head *bh) +{ + int ret; + + if (buffer_nilfs_checked(bh)) + return 0; + + ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, + bh->b_size, bh->b_blocknr); + if (likely(!ret)) + set_buffer_nilfs_checked(bh); + return ret; +} + +static struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_bmap *btree) +{ + return (struct nilfs_btree_node *)btree->b_u.u_data; +} + +static struct nilfs_btree_node * nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } -static inline struct nilfs_btree_node * +static struct nilfs_btree_node * nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } -static inline int nilfs_btree_height(const struct nilfs_btree *btree) +static int nilfs_btree_height(const struct nilfs_bmap *btree) { return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } -static inline struct nilfs_btree_node * -nilfs_btree_get_node(const struct nilfs_btree *btree, +static struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, - int level) + int level, int *ncmaxp) { - return (level == nilfs_btree_height(btree) - 1) ? - nilfs_btree_get_root(btree) : - nilfs_btree_get_nonroot_node(path, level); + struct nilfs_btree_node *node; + + if (level == nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_root(btree); + *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX; + } else { + node = nilfs_btree_get_nonroot_node(path, level); + *ncmaxp = nilfs_btree_nchildren_per_block(btree); + } + return node; } -static inline int +static int nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) { if (unlikely(nilfs_btree_node_get_level(node) != level)) { @@ -427,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) return 0; } -static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, +struct nilfs_btree_readahead_info { + struct nilfs_btree_node *node; /* parent node */ + int max_ra_blocks; /* max nof blocks to read ahead */ + int index; /* current index on the parent node */ + int ncmax; /* nof children in the parent node */ +}; + +static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp, + const struct nilfs_btree_readahead_info *ra) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh, *ra_bh; + sector_t submit_ptr = 0; + int ret; + + ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + if (ret) { + if (ret != -EEXIST) + return ret; + goto out_check; + } + + if (ra) { + int i, n; + __u64 ptr2; + + /* read ahead sibling nodes */ + for (n = ra->max_ra_blocks, i = ra->index + 1; + n > 0 && i < ra->ncmax; n--, i++) { + ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); + + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + &ra_bh, &submit_ptr); + if (likely(!ret || ret == -EEXIST)) + brelse(ra_bh); + else if (ret != -EBUSY) + break; + if (!buffer_locked(bh)) + goto out_no_wait; + } + } + + wait_on_buffer(bh); + + out_no_wait: + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + + out_check: + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + brelse(bh); + return -EINVAL; + } + + *bhp = bh; + return 0; +} + +static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp) +{ + return __nilfs_btree_get_block(btree, ptr, bhp, NULL); +} + +static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, struct nilfs_btree_path *path, - __u64 key, __u64 *ptrp, int minlevel) + __u64 key, __u64 *ptrp, int minlevel, + int readahead) { struct nilfs_btree_node *node; + struct nilfs_btree_readahead_info p, *ra; __u64 ptr; - int level, index, found, ret; + int level, index, found, ncmax, ret; node = nilfs_btree_get_root(btree); level = nilfs_btree_node_get_level(node); @@ -441,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, return -ENOENT; found = nilfs_btree_node_lookup(node, key, &index); - ptr = nilfs_btree_node_get_ptr(btree, node, index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); path[level].bp_bh = NULL; path[level].bp_index = index; - for (level--; level >= minlevel; level--) { - ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); + ncmax = nilfs_btree_nchildren_per_block(btree); + + while (--level >= minlevel) { + ra = NULL; + if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) { + p.node = nilfs_btree_get_node(btree, path, level + 1, + &p.ncmax); + p.index = index; + p.max_ra_blocks = 7; + ra = &p; + } + ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh, + ra); if (ret < 0) return ret; + node = nilfs_btree_get_nonroot_node(path, level); if (nilfs_btree_bad_node(node, level)) return -EINVAL; @@ -456,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, found = nilfs_btree_node_lookup(node, key, &index); else index = 0; - if (index < nilfs_btree_node_nchildren_max(node, btree)) - ptr = nilfs_btree_node_get_ptr(btree, node, index); - else { + if (index < ncmax) { + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + } else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); /* insert */ ptr = NILFS_BMAP_INVALID_PTR; @@ -474,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, return 0; } -static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, +static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, struct nilfs_btree_path *path, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; __u64 ptr; - int index, level, ret; + int index, level, ncmax, ret; node = nilfs_btree_get_root(btree); index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; level = nilfs_btree_node_get_level(node); - ptr = nilfs_btree_node_get_ptr(btree, node, index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); path[level].bp_bh = NULL; path[level].bp_index = index; + ncmax = nilfs_btree_nchildren_per_block(btree); for (level--; level > 0; level--) { ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); @@ -499,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, if (nilfs_btree_bad_node(node, level)) return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; - ptr = nilfs_btree_node_get_ptr(btree, node, index); + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); path[level].bp_index = index; } @@ -511,51 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, return 0; } -static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, +static int nilfs_btree_lookup(const struct nilfs_bmap *btree, __u64 key, int level, __u64 *ptrp) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; - __u64 ptr; int ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); - - if (ptrp != NULL) - *ptrp = ptr; + ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); nilfs_btree_free_path(path); return ret; } -static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, +static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, __u64 key, __u64 *ptrp, unsigned maxblocks) { - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_path *path; struct nilfs_btree_node *node; struct inode *dat = NULL; __u64 ptr, ptr2; sector_t blocknr; int level = NILFS_BTREE_LEVEL_NODE_MIN; - int ret, cnt, index, maxlevel; + int ret, cnt, index, maxlevel, ncmax; + struct nilfs_btree_readahead_info p; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); if (ret < 0) goto out; - if (NILFS_BMAP_USE_VBN(bmap)) { - dat = nilfs_bmap_get_dat(bmap); + if (NILFS_BMAP_USE_VBN(btree)) { + dat = nilfs_bmap_get_dat(btree); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) goto out; @@ -566,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, goto end; maxlevel = nilfs_btree_height(btree) - 1; - node = nilfs_btree_get_node(btree, path, level); + node = nilfs_btree_get_node(btree, path, level, &ncmax); index = path[level].bp_index + 1; for (;;) { while (index < nilfs_btree_node_get_nchildren(node)) { if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; - ptr2 = nilfs_btree_node_get_ptr(btree, node, index); + ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); if (dat) { ret = nilfs_dat_translate(dat, ptr2, &blocknr); if (ret < 0) @@ -589,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, break; /* look-up right sibling node */ - node = nilfs_btree_get_node(btree, path, level + 1); - index = path[level + 1].bp_index + 1; - if (index >= nilfs_btree_node_get_nchildren(node) || - nilfs_btree_node_get_key(node, index) != key + cnt) + p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); + p.index = path[level + 1].bp_index + 1; + p.max_ra_blocks = 7; + if (p.index >= nilfs_btree_node_get_nchildren(p.node) || + nilfs_btree_node_get_key(p.node, p.index) != key + cnt) break; - ptr2 = nilfs_btree_node_get_ptr(btree, node, index); - path[level + 1].bp_index = index; + ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); + path[level + 1].bp_index = p.index; brelse(path[level].bp_bh); path[level].bp_bh = NULL; - ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); + + ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, + &p); if (ret < 0) goto out; node = nilfs_btree_get_nonroot_node(path, level); + ncmax = nilfs_btree_nchildren_per_block(btree); index = 0; path[level].bp_index = index; } @@ -614,7 +704,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, return ret; } -static void nilfs_btree_promote_key(struct nilfs_btree *btree, +static void nilfs_btree_promote_key(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 key) { @@ -636,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, } } -static void nilfs_btree_do_insert(struct nilfs_btree *btree, +static void nilfs_btree_do_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; + int ncblk; if (level < nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_nonroot_node(path, level); - nilfs_btree_node_insert(btree, node, *keyp, *ptrp, - path[level].bp_index); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -655,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, 0)); } else { node = nilfs_btree_get_root(btree); - nilfs_btree_node_insert(btree, node, *keyp, *ptrp, - path[level].bp_index); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); } } -static void nilfs_btree_carry_left(struct nilfs_btree *btree, +static void nilfs_btree_carry_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; - int nchildren, lnchildren, n, move; + int nchildren, lnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; @@ -680,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, move = 1; } - nilfs_btree_node_move_left(btree, left, node, n); + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -705,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } -static void nilfs_btree_carry_right(struct nilfs_btree *btree, +static void nilfs_btree_carry_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - int nchildren, rnchildren, n, move; + int nchildren, rnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; @@ -725,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, move = 1; } - nilfs_btree_node_move_right(btree, node, right, n); + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -751,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } -static void nilfs_btree_split(struct nilfs_btree *btree, +static void nilfs_btree_split(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; __u64 newkey; __u64 newptr; - int nchildren, n, move; + int nchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); + ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + 1) / 2; @@ -771,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, move = 1; } - nilfs_btree_node_move_right(btree, node, right, n); + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -783,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree, if (move) { path[level].bp_index -= nilfs_btree_node_get_nchildren(node); - nilfs_btree_node_insert(btree, right, *keyp, *ptrp, - path[level].bp_index); + nilfs_btree_node_insert(right, path[level].bp_index, + *keyp, *ptrp, ncblk); *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; @@ -805,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree, path[level + 1].bp_index++; } -static void nilfs_btree_grow(struct nilfs_btree *btree, +static void nilfs_btree_grow(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *root, *child; - int n; + int n, ncblk; root = nilfs_btree_get_root(btree); child = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(root); - nilfs_btree_node_move_right(btree, root, child, n); + nilfs_btree_node_move_right(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) @@ -832,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, *ptrp = path[level].bp_newreq.bpr_ptr; } -static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, +static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path) { struct nilfs_btree_node *node; - int level; + int level, ncmax; if (path == NULL) return NILFS_BMAP_INVALID_PTR; @@ -844,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, /* left sibling */ level = NILFS_BTREE_LEVEL_NODE_MIN; if (path[level].bp_index > 0) { - node = nilfs_btree_get_node(btree, path, level); - return nilfs_btree_node_get_ptr(btree, node, - path[level].bp_index - 1); + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, + path[level].bp_index - 1, + ncmax); } /* parent */ level = NILFS_BTREE_LEVEL_NODE_MIN + 1; if (level <= nilfs_btree_height(btree) - 1) { - node = nilfs_btree_get_node(btree, path, level); - return nilfs_btree_node_get_ptr(btree, node, - path[level].bp_index); + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncmax); } return NILFS_BMAP_INVALID_PTR; } -static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, +static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, __u64 key) { __u64 ptr; - ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); + ptr = nilfs_bmap_find_target_seq(btree, key); if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; @@ -877,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, return ptr; } /* block group */ - return nilfs_bmap_find_target_in_group(&btree->bt_bmap); -} - -static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, - __u64 ptr) -{ - btree->bt_bmap.b_last_allocated_key = key; - btree->bt_bmap.b_last_allocated_ptr = ptr; + return nilfs_bmap_find_target_in_group(btree); } -static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, +static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int *levelp, __u64 key, __u64 ptr, struct nilfs_bmap_stats *stats) @@ -895,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; - int pindex, level, ret; + int pindex, level, ncmax, ncblk, ret; struct inode *dat = NULL; stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { + if (NILFS_BMAP_USE_VBN(btree)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); - dat = nilfs_bmap_get_dat(&btree->bt_bmap); + dat = nilfs_bmap_get_dat(btree); } - ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; + ncblk = nilfs_btree_nchildren_per_block(btree); + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); - if (nilfs_btree_node_get_nchildren(node) < - nilfs_btree_node_nchildren_max(node, btree)) { + if (nilfs_btree_node_get_nchildren(node) < ncblk) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; } - parent = nilfs_btree_get_node(btree, path, level + 1); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; /* left sibling */ if (pindex > 0) { - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex - 1); + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) < - nilfs_btree_node_nchildren_max(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; goto out; - } else + } else { brelse(bh); + } } /* right sibling */ - if (pindex < - nilfs_btree_node_get_nchildren(parent) - 1) { - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex + 1); + if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) < - nilfs_btree_node_nchildren_max(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; goto out; - } else + } else { brelse(bh); + } } /* split */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; @@ -979,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, stats->bs_nblocks++; - nilfs_btree_node_init(btree, - (struct nilfs_btree_node *)bh->b_data, - 0, level, 0, NULL, NULL); + sib = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_split; } @@ -989,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* root */ node = nilfs_btree_get_root(btree); if (nilfs_btree_node_get_nchildren(node) < - nilfs_btree_node_nchildren_max(node, btree)) { + NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -997,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, @@ -1006,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; - nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, - 0, level, 0, NULL, NULL); + nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data, + 0, level, 0, ncblk, NULL, NULL); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_grow; @@ -1024,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, - dat); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq, dat); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); } - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, - dat); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; return ret; } -static void nilfs_btree_commit_insert(struct nilfs_btree *btree, +static void nilfs_btree_commit_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { @@ -1051,35 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { - nilfs_btree_set_target_v(btree, key, ptr); - dat = nilfs_bmap_get_dat(&btree->bt_bmap); + if (NILFS_BMAP_USE_VBN(btree)) { + nilfs_bmap_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(btree); } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { - nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, + nilfs_bmap_commit_alloc_ptr(btree, &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } - if (!nilfs_bmap_dirty(&btree->bt_bmap)) - nilfs_bmap_set_dirty(&btree->bt_bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); } -static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; int level, ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, NULL, - NILFS_BTREE_LEVEL_NODE_MIN); + NILFS_BTREE_LEVEL_NODE_MIN, 0); if (ret != -ENOENT) { if (ret == 0) ret = -EEXIST; @@ -1090,23 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) if (ret < 0) goto out; nilfs_btree_commit_insert(btree, path, level, key, ptr); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + nilfs_bmap_add_blocks(btree, stats.bs_nblocks); out: nilfs_btree_free_path(path); return ret; } -static void nilfs_btree_do_delete(struct nilfs_btree *btree, +static void nilfs_btree_do_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; + int ncblk; if (level < nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_nonroot_node(path, level); - nilfs_btree_node_delete(btree, node, keyp, ptrp, - path[level].bp_index); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); if (path[level].bp_index == 0) @@ -1114,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); - nilfs_btree_node_delete(btree, node, keyp, ptrp, - path[level].bp_index); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); } } -static void nilfs_btree_borrow_left(struct nilfs_btree *btree, +static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; - int nchildren, lnchildren, n; + int nchildren, lnchildren, n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); @@ -1132,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); n = (nchildren + lnchildren) / 2 - nchildren; - nilfs_btree_node_move_right(btree, left, node, n); + nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -1150,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, path[level].bp_index += n; } -static void nilfs_btree_borrow_right(struct nilfs_btree *btree, +static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - int nchildren, rnchildren, n; + int nchildren, rnchildren, n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); @@ -1163,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); n = (nchildren + rnchildren) / 2 - nchildren; - nilfs_btree_node_move_left(btree, node, right, n); + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -1182,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, path[level].bp_sib_bh = NULL; } -static void nilfs_btree_concat_left(struct nilfs_btree *btree, +static void nilfs_btree_concat_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; - int n; + int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(node); - nilfs_btree_node_move_left(btree, left, node, n); + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); @@ -1207,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, path[level].bp_index += nilfs_btree_node_get_nchildren(left); } -static void nilfs_btree_concat_right(struct nilfs_btree *btree, +static void nilfs_btree_concat_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - int n; + int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(right); - nilfs_btree_node_move_left(btree, node, right, n); + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -1231,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, path[level + 1].bp_index++; } -static void nilfs_btree_shrink(struct nilfs_btree *btree, +static void nilfs_btree_shrink(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *root, *child; - int n; + int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); root = nilfs_btree_get_root(btree); child = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); - nilfs_btree_node_delete(btree, root, NULL, NULL, 0); + nilfs_btree_node_delete(root, 0, NULL, NULL, + NILFS_BTREE_ROOT_NCHILDREN_MAX); nilfs_btree_node_set_level(root, level); n = nilfs_btree_node_get_nchildren(child); - nilfs_btree_node_move_left(btree, root, child, n); + nilfs_btree_node_move_left(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = NULL; } -static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, +static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int *levelp, struct nilfs_bmap_stats *stats, @@ -1262,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; - int pindex, level, ret; + int pindex, level, ncmin, ncmax, ncblk, ret; ret = 0; stats->bs_nblocks = 0; + ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + ncblk = nilfs_btree_nchildren_per_block(btree); + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(btree, node, - path[level].bp_index); - ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, + nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncblk); + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - if (nilfs_btree_node_get_nchildren(node) > - nilfs_btree_node_nchildren_min(node, btree)) { + if (nilfs_btree_node_get_nchildren(node) > ncmin) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; } - parent = nilfs_btree_get_node(btree, path, level + 1); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; if (pindex > 0) { /* left sibling */ - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex - 1); + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) > - nilfs_btree_node_nchildren_min(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; @@ -1311,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, } else if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex + 1); + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) > - nilfs_btree_node_nchildren_min(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; @@ -1349,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); + nilfs_btree_node_get_ptr(node, path[level].bp_index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); - ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq, dat); + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; @@ -1367,75 +1461,68 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq, dat); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; return ret; } -static void nilfs_btree_commit_delete(struct nilfs_btree *btree, +static void nilfs_btree_commit_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { - nilfs_bmap_commit_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq, dat); + nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } - if (!nilfs_bmap_dirty(&btree->bt_bmap)) - nilfs_bmap_set_dirty(&btree->bt_bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); } -static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) +static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; struct inode *dat; int level, ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, NULL, - NILFS_BTREE_LEVEL_NODE_MIN); + NILFS_BTREE_LEVEL_NODE_MIN, 0); if (ret < 0) goto out; - dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? - nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; nilfs_btree_commit_delete(btree, path, level, dat); - nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + nilfs_bmap_sub_blocks(btree, stats.bs_nblocks); out: nilfs_btree_free_path(path); return ret; } -static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; int ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; @@ -1447,16 +1534,14 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) return ret; } -static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) +static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) { struct buffer_head *bh; - struct nilfs_btree *btree; struct nilfs_btree_node *root, *node; __u64 maxkey, nextmaxkey; __u64 ptr; int nchildren, ret; - btree = (struct nilfs_btree *)bmap; root = nilfs_btree_get_root(btree); switch (nilfs_btree_height(btree)) { case 2: @@ -1467,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) nchildren = nilfs_btree_node_get_nchildren(root); if (nchildren > 1) return 0; - ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; @@ -1487,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } -static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, +static int nilfs_btree_gather_data(struct nilfs_bmap *btree, __u64 *keys, __u64 *ptrs, int nitems) { struct buffer_head *bh; - struct nilfs_btree *btree; struct nilfs_btree_node *node, *root; __le64 *dkeys; __le64 *dptrs; __u64 ptr; - int nchildren, i, ret; + int nchildren, ncmax, i, ret; - btree = (struct nilfs_btree *)bmap; root = nilfs_btree_get_root(btree); switch (nilfs_btree_height(btree)) { case 2: bh = NULL; node = root; + ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX; break; case 3: nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); - ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; node = (struct nilfs_btree_node *)bh->b_data; + ncmax = nilfs_btree_nchildren_per_block(btree); break; default: node = NULL; @@ -1523,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, if (nchildren < nitems) nitems = nchildren; dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); + dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nitems; i++) { - keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); - ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); + keys[i] = le64_to_cpu(dkeys[i]); + ptrs[i] = le64_to_cpu(dptrs[i]); } if (bh != NULL) @@ -1536,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, } static int -nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, union nilfs_bmap_ptr_req *dreq, union nilfs_bmap_ptr_req *nreq, struct buffer_head **bhp, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct inode *dat = NULL; int ret; @@ -1551,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* for data */ /* cannot find near ptr */ - if (NILFS_BMAP_USE_VBN(bmap)) { + if (NILFS_BMAP_USE_VBN(btree)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); - dat = nilfs_bmap_get_dat(bmap); + dat = nilfs_bmap_get_dat(btree); } - ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); if (ret < 0) return ret; @@ -1564,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat); if (ret < 0) goto err_out_dreq; @@ -1581,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); + nilfs_bmap_abort_alloc_ptr(btree, nreq, dat); err_out_dreq: - nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_abort_alloc_ptr(btree, dreq, dat); stats->bs_nblocks = 0; return ret; } static void -nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n, @@ -1598,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_node *node; struct inode *dat; __u64 tmpptr; + int ncblk; /* free resources */ - if (bmap->b_ops->bop_clear != NULL) - bmap->b_ops->bop_clear(bmap); + if (btree->b_ops->bop_clear != NULL) + btree->b_ops->bop_clear(btree); /* ptr must be a pointer to a buffer head. */ set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ - dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; - nilfs_btree_init(bmap); + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; + nilfs_btree_init(btree); if (nreq != NULL) { - nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); - nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); /* create child node at level 1 */ node = (struct nilfs_btree_node *)bh->b_data; - nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); - nilfs_btree_node_insert(btree, node, - key, dreq->bpr_ptr, n); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); if (!buffer_dirty(bh)) nilfs_btnode_mark_dirty(bh); - if (!nilfs_bmap_dirty(bmap)) - nilfs_bmap_set_dirty(bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); brelse(bh); /* create root node at level 2 */ node = nilfs_btree_get_root(btree); tmpptr = nreq->bpr_ptr; - nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, - 2, 1, &keys[0], &tmpptr); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + &keys[0], &tmpptr); } else { - nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); - nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, - 1, n, keys, ptrs); - nilfs_btree_node_insert(btree, node, - key, dreq->bpr_ptr, n); - if (!nilfs_bmap_dirty(bmap)) - nilfs_bmap_set_dirty(bmap); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); } - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); + if (NILFS_BMAP_USE_VBN(btree)) + nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr); } /** @@ -1660,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, * @ptrs: * @n: */ -int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, +int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n) { @@ -1673,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, di = &dreq; ni = NULL; } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( - 1 << bmap->b_inode->i_blkbits)) { + 1 << btree->b_inode->i_blkbits)) { di = &dreq; ni = &nreq; } else { @@ -1682,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, BUG(); } - ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, + ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh, &stats); if (ret < 0) return ret; - nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, + nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, di, ni, bh); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + nilfs_bmap_add_blocks(btree, stats.bs_nblocks); return 0; } -static int nilfs_btree_propagate_p(struct nilfs_btree *btree, +static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) @@ -1704,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, return 0; } -static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, +static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { struct nilfs_btree_node *parent; - int ret; + int ncmax, ret; - parent = nilfs_btree_get_node(btree, path, level + 1); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req); @@ -1726,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; path[level].bp_ctxt.bh = path[level].bp_bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { nilfs_dat_abort_update(dat, @@ -1739,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, return 0; } -static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, +static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { struct nilfs_btree_node *parent; + int ncmax; nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req, - btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); + btree->b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); path[level].bp_bh = path[level].bp_ctxt.bh; } set_buffer_nilfs_volatile(path[level].bp_bh); - parent = nilfs_btree_get_node(btree, path, level + 1); - nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, - path[level].bp_newreq.bpr_ptr); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr, ncmax); } -static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, +static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { @@ -1770,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); } -static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, +static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int minlevel, int *maxlevelp, struct inode *dat) @@ -1809,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, return ret; } -static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, +static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int minlevel, int maxlevel, struct buffer_head *bh, @@ -1824,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, nilfs_btree_commit_update_v(btree, path, level, dat); } -static int nilfs_btree_propagate_v(struct nilfs_btree *btree, +static int nilfs_btree_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) { int maxlevel = 0, ret; struct nilfs_btree_node *parent; - struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); + struct inode *dat = nilfs_bmap_get_dat(btree); __u64 ptr; + int ncmax; get_bh(bh); path[level].bp_bh = bh; @@ -1841,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, goto out; if (buffer_nilfs_volatile(path[level].bp_bh)) { - parent = nilfs_btree_get_node(btree, path, level + 1); - ptr = nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, + path[level + 1].bp_index, + ncmax); ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; @@ -1857,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, return ret; } -static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, +static int nilfs_btree_propagate(struct nilfs_bmap *btree, struct buffer_head *bh) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; @@ -1868,7 +1958,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, WARN_ON(!buffer_dirty(bh)); - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; @@ -1878,11 +1967,11 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { - key = nilfs_bmap_data_get_key(bmap, bh); + key = nilfs_bmap_data_get_key(btree, bh); level = NILFS_BTREE_LEVEL_DATA; } - ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { if (unlikely(ret == -ENOENT)) printk(KERN_CRIT "%s: key = %llu, level == %d\n", @@ -1890,7 +1979,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, goto out; } - ret = NILFS_BMAP_USE_VBN(bmap) ? + ret = NILFS_BMAP_USE_VBN(btree) ? nilfs_btree_propagate_v(btree, path, level, bh) : nilfs_btree_propagate_p(btree, path, level, bh); @@ -1900,13 +1989,13 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, return ret; } -static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, +static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, struct buffer_head *bh) { - return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); } -static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, +static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, struct list_head *lists, struct buffer_head *bh) { @@ -1920,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, node = (struct nilfs_btree_node *)bh->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); + if (level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX) { + dump_stack(); + printk(KERN_WARNING + "%s: invalid btree level: %d (key=%llu, ino=%lu, " + "blocknr=%llu)\n", + __func__, level, (unsigned long long)key, + NILFS_BMAP_I(btree)->vfs_inode.i_ino, + (unsigned long long)bh->b_blocknr); + return; + } + list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; @@ -1930,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, list_add_tail(&bh->b_assoc_buffers, head); } -static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; - struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache; + struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct pagevec pvec; struct buffer_head *bh, *head; @@ -1968,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, list_splice_tail(&lists[level], listp); } -static int nilfs_btree_assign_p(struct nilfs_btree *btree, +static int nilfs_btree_assign_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, @@ -1978,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, struct nilfs_btree_node *parent; __u64 key; __u64 ptr; - int ret; + int ncmax, ret; - parent = nilfs_btree_get_node(btree, path, level + 1); - ptr = nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); if (buffer_nilfs_node(*bh)) { path[level].bp_ctxt.oldkey = ptr; path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } - nilfs_btree_node_set_ptr(btree, parent, - path[level + 1].bp_index, blocknr); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, + ncmax); key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ - binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = level; return 0; } -static int nilfs_btree_assign_v(struct nilfs_btree *btree, +static int nilfs_btree_assign_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, @@ -2017,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; - struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); + struct inode *dat = nilfs_bmap_get_dat(btree); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; - int ret; + int ncmax, ret; - parent = nilfs_btree_get_node(btree, path, level + 1); - ptr = nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); req.bpr_ptr = ptr; ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (ret < 0) @@ -2034,24 +2134,22 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } -static int nilfs_btree_assign(struct nilfs_bmap *bmap, +static int nilfs_btree_assign(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; int level, ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; @@ -2061,17 +2159,17 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { - key = nilfs_bmap_data_get_key(bmap, *bh); + key = nilfs_bmap_data_get_key(btree, *bh); level = NILFS_BTREE_LEVEL_DATA; } - ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } - ret = NILFS_BMAP_USE_VBN(bmap) ? + ret = NILFS_BMAP_USE_VBN(btree) ? nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); @@ -2081,7 +2179,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, return ret; } -static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, +static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) @@ -2090,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, __u64 key; int ret; - ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, + ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, blocknr); if (ret < 0) return ret; @@ -2099,29 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, node = (struct nilfs_btree_node *)(*bh)->b_data; key = nilfs_btree_node_get_key(node, 0); } else - key = nilfs_bmap_data_get_key(bmap, *bh); + key = nilfs_bmap_data_get_key(btree, *bh); /* on-disk format */ binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } -static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) +static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) { struct buffer_head *bh; - struct nilfs_btree *btree; struct nilfs_btree_path *path; __u64 ptr; int ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; @@ -2135,8 +2231,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) if (!buffer_dirty(bh)) nilfs_btnode_mark_dirty(bh); brelse(bh); - if (!nilfs_bmap_dirty(&btree->bt_bmap)) - nilfs_bmap_set_dirty(&btree->bt_bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); out: nilfs_btree_free_path(path); @@ -2186,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { int nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); return 0; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops_gc; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 43c8c5b..22c02e3 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -31,14 +31,6 @@ #include "bmap.h" /** - * struct nilfs_btree - B-tree structure - * @bt_bmap: bmap base structure - */ -struct nilfs_btree { - struct nilfs_bmap bt_bmap; -}; - -/** * struct nilfs_btree_path - A path on which B-tree operations are executed * @bp_bh: buffer head of node block * @bp_sib_bh: buffer head of sibling node block @@ -54,7 +46,7 @@ struct nilfs_btree_path { union nilfs_bmap_ptr_req bp_oldreq; union nilfs_bmap_ptr_req bp_newreq; struct nilfs_btnode_chkey_ctxt bp_ctxt; - void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *, int, __u64 *, __u64 *); }; @@ -80,4 +72,6 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, const __u64 *, const __u64 *, int); void nilfs_btree_init_gc(struct nilfs_bmap *); +int nilfs_btree_broken_node_block(struct buffer_head *bh); + #endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 85c89df..b60277b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -141,7 +141,7 @@ static void nilfs_check_page(struct page *page) } for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { p = (struct nilfs_dir_entry *)(kaddr + offs); - rec_len = le16_to_cpu(p->rec_len); + rec_len = nilfs_rec_len_from_disk(p->rec_len); if (rec_len < NILFS_DIR_REC_LEN(1)) goto Eshort; @@ -199,13 +199,10 @@ fail: static struct page *nilfs_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_cache_page(mapping, n, - (filler_t *)mapping->a_ops->readpage, NULL); + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { - wait_on_page_locked(page); kmap(page); - if (!PageUptodate(page)) - goto fail; if (!PageChecked(page)) nilfs_check_page(page); if (PageError(page)) @@ -238,7 +235,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) */ static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) { - return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); + return (struct nilfs_dir_entry *)((char *)p + + nilfs_rec_len_from_disk(p->rec_len)); } static unsigned char @@ -329,7 +327,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto success; } } - filp->f_pos += le16_to_cpu(de->rec_len); + filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); } nilfs_put_page(page); } @@ -444,7 +442,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { unsigned from = (char *) de - (char *) page_address(page); - unsigned to = from + le16_to_cpu(de->rec_len); + unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; @@ -500,7 +498,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) /* We hit i_size */ name_len = 0; rec_len = chunk_size; - de->rec_len = cpu_to_le16(chunk_size); + de->rec_len = nilfs_rec_len_to_disk(chunk_size); de->inode = 0; goto got_it; } @@ -514,7 +512,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) if (nilfs_match(namelen, name, de)) goto out_unlock; name_len = NILFS_DIR_REC_LEN(de->name_len); - rec_len = le16_to_cpu(de->rec_len); + rec_len = nilfs_rec_len_from_disk(de->rec_len); if (!de->inode && rec_len >= reclen) goto got_it; if (rec_len >= name_len + reclen) @@ -537,8 +535,8 @@ got_it: struct nilfs_dir_entry *de1; de1 = (struct nilfs_dir_entry *)((char *)de + name_len); - de1->rec_len = cpu_to_le16(rec_len - name_len); - de->rec_len = cpu_to_le16(name_len); + de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); + de->rec_len = nilfs_rec_len_to_disk(name_len); de = de1; } de->name_len = namelen; @@ -569,7 +567,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); - unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + unsigned to = ((char *)dir - kaddr) + + nilfs_rec_len_from_disk(dir->rec_len); struct nilfs_dir_entry *pde = NULL; struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); int err; @@ -590,7 +589,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) err = nilfs_prepare_chunk(page, mapping, from, to); BUG_ON(err); if (pde) - pde->rec_len = cpu_to_le16(to - from); + pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -624,14 +623,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) memset(kaddr, 0, chunk_size); de = (struct nilfs_dir_entry *)kaddr; de->name_len = 1; - de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); memcpy(de->name, ".\0\0", 4); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); de->name_len = 2; - de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); de->inode = cpu_to_le64(parent->i_ino); memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 236753d..324d80c 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -27,47 +27,43 @@ #include "alloc.h" #include "dat.h" -static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct) { return (__le64 *) - ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); + ((struct nilfs_direct_node *)direct->b_u.u_data + 1); } static inline __u64 -nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) +nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); + return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key)); } -static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, +static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct, __u64 key, __u64 ptr) { - *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); + *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr); } -static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, +static int nilfs_direct_lookup(const struct nilfs_bmap *direct, __u64 key, int level, __u64 *ptrp) { - struct nilfs_direct *direct; __u64 ptr; - direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */ if (key > NILFS_DIRECT_KEY_MAX || level != 1) return -ENOENT; ptr = nilfs_direct_get_ptr(direct, key); if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; - if (ptrp != NULL) - *ptrp = ptr; + *ptrp = ptr; return 0; } -static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, +static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, __u64 key, __u64 *ptrp, unsigned maxblocks) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; struct inode *dat = NULL; __u64 ptr, ptr2; sector_t blocknr; @@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; - if (NILFS_BMAP_USE_VBN(bmap)) { - dat = nilfs_bmap_get_dat(bmap); + if (NILFS_BMAP_USE_VBN(direct)) { + dat = nilfs_bmap_get_dat(direct); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) return ret; @@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, } static __u64 -nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) +nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) { __u64 ptr; - ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); + ptr = nilfs_bmap_find_target_seq(direct, key); if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; else /* block group */ - return nilfs_bmap_find_target_in_group(&direct->d_bmap); -} - -static void nilfs_direct_set_target_v(struct nilfs_direct *direct, - __u64 key, __u64 ptr) -{ - direct->d_bmap.b_last_allocated_key = key; - direct->d_bmap.b_last_allocated_ptr = ptr; + return nilfs_bmap_find_target_in_group(direct); } static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; struct inode *dat = NULL; struct buffer_head *bh; @@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; - if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) + if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; if (NILFS_BMAP_USE_VBN(bmap)) { - req.bpr_ptr = nilfs_direct_find_target_v(direct, key); + req.bpr_ptr = nilfs_direct_find_target_v(bmap, key); dat = nilfs_bmap_get_dat(bmap); } ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); @@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) set_buffer_nilfs_volatile(bh); nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); - nilfs_direct_set_ptr(direct, key, req.bpr_ptr); + nilfs_direct_set_ptr(bmap, key, req.bpr_ptr); if (!nilfs_bmap_dirty(bmap)) nilfs_bmap_set_dirty(bmap); if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_direct_set_target_v(direct, key, req.bpr_ptr); + nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); nilfs_bmap_add_blocks(bmap, 1); } @@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; struct inode *dat; int ret; if (key > NILFS_DIRECT_KEY_MAX || - nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) + nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; - req.bpr_ptr = nilfs_direct_get_ptr(direct, key); + req.bpr_ptr = nilfs_direct_get_ptr(bmap, key); ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); if (!ret) { nilfs_bmap_commit_end_ptr(bmap, &req, dat); - nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); nilfs_bmap_sub_blocks(bmap, 1); } return ret; } -static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) { - struct nilfs_direct *direct; __u64 key, lastkey; - direct = (struct nilfs_direct *)bmap; lastkey = NILFS_DIRECT_KEY_MAX + 1; for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) if (nilfs_direct_get_ptr(direct, key) != @@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) return key > NILFS_DIRECT_KEY_MAX; } -static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, +static int nilfs_direct_gather_data(struct nilfs_bmap *direct, __u64 *keys, __u64 *ptrs, int nitems) { - struct nilfs_direct *direct; __u64 key; __u64 ptr; int n; - direct = (struct nilfs_direct *)bmap; if (nitems > NILFS_DIRECT_NBLOCKS) nitems = NILFS_DIRECT_NBLOCKS; n = 0; @@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, __u64 key, __u64 *keys, __u64 *ptrs, int n) { - struct nilfs_direct *direct; __le64 *dptrs; int ret, i, j; @@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, bmap->b_ops->bop_clear(bmap); /* convert */ - direct = (struct nilfs_direct *)bmap; - dptrs = nilfs_direct_dptrs(direct); + dptrs = nilfs_direct_dptrs(bmap); for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { if ((j < n) && (i == keys[j])) { dptrs[i] = (i != key) ? - nilfs_bmap_ptr_to_dptr(ptrs[j]) : + cpu_to_le64(ptrs[j]) : NILFS_BMAP_INVALID_PTR; j++; } else @@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, return 0; } -static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, +static int nilfs_direct_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; struct nilfs_palloc_req oldreq, newreq; struct inode *dat; __u64 key; @@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); - ptr = nilfs_direct_get_ptr(direct, key); + ptr = nilfs_direct_get_ptr(bmap, key); if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; @@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, nilfs_dat_commit_update(dat, &oldreq, &newreq, bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); - nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); + nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr); } else ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } -static int nilfs_direct_assign_v(struct nilfs_direct *direct, +static int nilfs_direct_assign_v(struct nilfs_bmap *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { - struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); + struct inode *dat = nilfs_bmap_get_dat(direct); union nilfs_bmap_ptr_req req; int ret; @@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct, ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (!ret) { nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); } return ret; } -static int nilfs_direct_assign_p(struct nilfs_direct *direct, +static int nilfs_direct_assign_p(struct nilfs_bmap *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, @@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct, { nilfs_direct_set_ptr(direct, key, blocknr); - binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = 0; return 0; @@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_direct *direct; __u64 key; __u64 ptr; - direct = (struct nilfs_direct *)bmap; key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, (unsigned long long)key); return -EINVAL; } - ptr = nilfs_direct_get_ptr(direct, key); + ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, (unsigned long long)ptr); @@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, } return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) : - nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo); + nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) : + nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo); } static const struct nilfs_bmap_operations nilfs_direct_ops = { diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index a5ffd66..dc643de 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -28,8 +28,6 @@ #include "bmap.h" -struct nilfs_direct; - /** * struct nilfs_direct_node - direct node * @dn_flags: flags @@ -40,15 +38,6 @@ struct nilfs_direct_node { __u8 pad[7]; }; -/** - * struct nilfs_direct - direct mapping - * @d_bmap: bmap structure - */ -struct nilfs_direct { - struct nilfs_bmap d_bmap; -}; - - #define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) #define NILFS_DIRECT_KEY_MIN 0 #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 145f03c..bed3a78 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -48,6 +48,8 @@ #include <linux/slab.h> #include <linux/swap.h> #include "nilfs.h" +#include "btree.h" +#include "btnode.h" #include "page.h" #include "mdt.h" #include "dat.h" @@ -149,8 +151,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { - int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, - vbn ? : pbn, pbn, out_bh); + int ret; + + ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? : pbn, pbn, READ, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; @@ -164,10 +168,15 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) if (buffer_dirty(bh)) return -EEXIST; - if (buffer_nilfs_node(bh)) + if (buffer_nilfs_node(bh)) { + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; + } nilfs_btnode_mark_dirty(bh); - else + } else { nilfs_mdt_mark_buffer_dirty(bh); + } return 0; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 024be8c..d01aff4 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -28,6 +28,7 @@ #include <linux/swap.h> #include <linux/slab.h> #include "nilfs.h" +#include "btnode.h" #include "segment.h" #include "page.h" #include "mdt.h" diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 47d6d79..0842d77 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -32,7 +32,6 @@ #include "the_nilfs.h" #include "sb.h" #include "bmap.h" -#include "bmap_union.h" /* * nilfs inode data in memory @@ -41,7 +40,7 @@ struct nilfs_inode_info { __u32 i_flags; unsigned long i_state; /* Dynamic state flags */ struct nilfs_bmap *i_bmap; - union nilfs_bmap_union i_bmap_union; + struct nilfs_bmap i_bmap_data; __u64 i_xattr; /* sector_t ??? */ __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ @@ -71,9 +70,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) static inline struct nilfs_inode_info * NILFS_BMAP_I(const struct nilfs_bmap *bmap) { - return container_of((union nilfs_bmap_union *)bmap, - struct nilfs_inode_info, - i_bmap_union); + return container_of(bmap, struct nilfs_inode_info, i_bmap_data); } static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) @@ -107,6 +104,14 @@ enum { }; /* + * commit flags for nilfs_commit_super and nilfs_sync_super + */ +enum { + NILFS_SB_COMMIT = 0, /* Commit a super block alternately */ + NILFS_SB_COMMIT_ALL /* Commit both super blocks */ +}; + +/* * Macros to check inode numbers */ #define NILFS_MDT_INO_BITS \ @@ -270,7 +275,14 @@ extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, struct nilfs_super_block *, char *); +extern int nilfs_check_feature_compatibility(struct super_block *, + struct nilfs_super_block *); +extern void nilfs_set_log_cursor(struct nilfs_super_block *, + struct the_nilfs *); +extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *, + int flip); extern int nilfs_commit_super(struct nilfs_sb_info *, int); +extern int nilfs_cleanup_super(struct nilfs_sb_info *); extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8de3e1e..aab11db 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -37,7 +37,8 @@ #define NILFS_BUFFER_INHERENT_BITS \ ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ - (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated)) + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \ + (1UL << BH_NILFS_Checked)) static struct buffer_head * __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, @@ -129,6 +130,7 @@ void nilfs_forget_buffer(struct buffer_head *bh) lock_buffer(bh); clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); clear_buffer_dirty(bh); if (nilfs_page_buffers_clean(page)) __nilfs_clear_page_dirty(page); @@ -480,6 +482,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) lock_buffer(bh); clear_buffer_dirty(bh); clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); clear_buffer_uptodate(bh); clear_buffer_mapped(bh); unlock_buffer(bh); diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 8abca4d..f53d8da 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -34,11 +34,13 @@ enum { BH_NILFS_Allocated = BH_PrivateStart, BH_NILFS_Node, BH_NILFS_Volatile, + BH_NILFS_Checked, }; BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ BUFFER_FNS(NILFS_Volatile, nilfs_volatile) +BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ void nilfs_mark_buffer_dirty(struct buffer_head *bh); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index bae2a51..83e3d8c 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -91,27 +91,9 @@ static int nilfs_warn_segment_error(int err) return -EINVAL; } -static void store_segsum_info(struct nilfs_segsum_info *ssi, - struct nilfs_segment_summary *sum, - unsigned int blocksize) -{ - ssi->flags = le16_to_cpu(sum->ss_flags); - ssi->seg_seq = le64_to_cpu(sum->ss_seq); - ssi->ctime = le64_to_cpu(sum->ss_create); - ssi->next = le64_to_cpu(sum->ss_next); - ssi->nblocks = le32_to_cpu(sum->ss_nblocks); - ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo); - ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes); - - ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); - ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); - - /* need to verify ->ss_bytes field if read ->ss_cno */ -} - /** - * calc_crc_cont - check CRC of blocks continuously - * @sbi: nilfs_sb_info + * nilfs_compute_checksum - compute checksum of blocks continuously + * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store result * @offset: offset bytes in the first block @@ -119,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi, * @start: DBN of start block * @nblock: number of blocks to be checked */ -static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, - u32 *sum, unsigned long offset, u64 check_bytes, - sector_t start, unsigned long nblock) +static int nilfs_compute_checksum(struct the_nilfs *nilfs, + struct buffer_head *bhs, u32 *sum, + unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) { - unsigned long blocksize = sbi->s_super->s_blocksize; + unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); - crc = crc32_le(sbi->s_nilfs->ns_crc_seed, + crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { - struct buffer_head *bh - = sb_bread(sbi->s_super, ++start); + struct buffer_head *bh; + + bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; @@ -150,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, /** * nilfs_read_super_root_block - read super root block - * @sb: super_block + * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag */ -int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, +int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; @@ -164,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, int ret; *pbh = NULL; - bh_sr = sb_bread(sb, sr_block); + bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; @@ -174,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, if (check) { unsigned bytes = le16_to_cpu(sr->sr_bytes); - if (bytes == 0 || bytes > sb->s_blocksize) { + if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } - if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, - sizeof(sr->sr_sum), bytes, sr_block, 1)) { + if (nilfs_compute_checksum( + nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, + sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } @@ -199,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, } /** - * load_segment_summary - read segment summary of the specified partial segment - * @sbi: nilfs_sb_info - * @pseg_start: start disk block number of partial segment - * @seg_seq: sequence number requested - * @ssi: pointer to nilfs_segsum_info struct to store information + * nilfs_read_log_header - read summary header of the specified log + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: pointer to return segment summary structure */ -static int -load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, - u64 seg_seq, struct nilfs_segsum_info *ssi) +static struct buffer_head * +nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; - struct nilfs_segment_summary *sum; + + bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (bh_sum) + *sum = (struct nilfs_segment_summary *)bh_sum->b_data; + return bh_sum; +} + +/** + * nilfs_validate_log - verify consistency of log + * @nilfs: nilfs object + * @seg_seq: sequence number of segment + * @bh_sum: buffer head of summary block + * @sum: segment summary struct + */ +static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, + struct buffer_head *bh_sum, + struct nilfs_segment_summary *sum) +{ unsigned long nblock; u32 crc; - int ret = NILFS_SEG_FAIL_IO; + int ret; - bh_sum = sb_bread(sbi->s_super, pseg_start); - if (!bh_sum) + ret = NILFS_SEG_FAIL_MAGIC; + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; - sum = (struct nilfs_segment_summary *)bh_sum->b_data; - - /* Check consistency of segment summary */ - if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) { - ret = NILFS_SEG_FAIL_MAGIC; - goto failed; - } - store_segsum_info(ssi, sum, sbi->s_super->s_blocksize); - if (seg_seq != ssi->seg_seq) { - ret = NILFS_SEG_FAIL_SEQ; - goto failed; - } + ret = NILFS_SEG_FAIL_SEQ; + if (le64_to_cpu(sum->ss_seq) != seg_seq) + goto out; - nblock = ssi->nblocks; - if (unlikely(nblock == 0 || - nblock > sbi->s_nilfs->ns_blocks_per_segment)) { + nblock = le32_to_cpu(sum->ss_nblocks); + ret = NILFS_SEG_FAIL_CONSISTENCY; + if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ - ret = NILFS_SEG_FAIL_CONSISTENCY; - goto failed; - } - if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum), - ((u64)nblock << sbi->s_super->s_blocksize_bits), - pseg_start, nblock)) { - ret = NILFS_SEG_FAIL_IO; - goto failed; - } - if (crc == le32_to_cpu(sum->ss_datasum)) - ret = 0; - else - ret = NILFS_SEG_FAIL_CHECKSUM_FULL; - failed: - brelse(bh_sum); - out: + goto out; + + ret = NILFS_SEG_FAIL_IO; + if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), + ((u64)nblock << nilfs->ns_blocksize_bits), + bh_sum->b_blocknr, nblock)) + goto out; + + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + if (crc != le32_to_cpu(sum->ss_datasum)) + goto out; + ret = 0; +out: return ret; } -static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, - unsigned int *offset, unsigned int bytes) +/** + * nilfs_read_summary_info - read an item on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be read + */ +static void *nilfs_read_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; @@ -265,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); - *pbh = sb_bread(sb, blocknr + 1); + *pbh = __bread(nilfs->ns_bdev, blocknr + 1, + nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; @@ -275,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, return ptr; } -static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, - unsigned int *offset, unsigned int bytes, - unsigned long count) +/** + * nilfs_skip_summary_info - skip items on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be skipped + * @count: number of items to be skipped + */ +static void nilfs_skip_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; @@ -294,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); - *pbh = sb_bread(sb, blocknr + bcnt); + *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, + nilfs->ns_blocksize); } } -static int -collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, - struct nilfs_segsum_info *ssi, - struct list_head *head) +/** + * nilfs_scan_dsync_log - get block information of a log written for data sync + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: log summary information + * @head: list head to add nilfs_recovery_block struct + */ +static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary *sum, + struct list_head *head) { struct buffer_head *bh; unsigned int offset; - unsigned long nfinfo = ssi->nfinfo; - sector_t blocknr = sum_blocknr + ssi->nsumblk; + u32 nfinfo, sumbytes; + sector_t blocknr; ino_t ino; int err = -EIO; + nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; - bh = sb_bread(sbi->s_super, sum_blocknr); + sumbytes = le32_to_cpu(sum->ss_sumbytes); + blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); + bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; - offset = le16_to_cpu( - ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes); + offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; - finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); + finfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*finfo)); if (unlikely(!finfo)) goto out; @@ -336,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; - binfo = segsum_get(sbi->s_super, &bh, &offset, - sizeof(*binfo)); + binfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*binfo)); if (unlikely(!binfo)) goto out; @@ -355,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, } if (--nfinfo == 0) break; - blocknr += nnodeblk; /* always 0 for the data sync segments */ - segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), - nnodeblk); + blocknr += nnodeblk; /* always 0 for data sync logs */ + nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), + nnodeblk); if (unlikely(!bh)) goto out; } @@ -467,14 +484,14 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, return err; } -static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, +static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, struct page *page) { struct buffer_head *bh_org; void *kaddr; - bh_org = sb_bread(sbi->s_super, rb->blocknr); + bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; @@ -485,13 +502,14 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, return 0; } -static int recover_dsync_blocks(struct nilfs_sb_info *sbi, - struct list_head *head, - unsigned long *nr_salvaged_blocks) +static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct list_head *head, + unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; - unsigned blocksize = sbi->s_super->s_blocksize; + unsigned blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; @@ -511,7 +529,7 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, if (unlikely(err)) goto failed_inode; - err = nilfs_recovery_copy_block(sbi, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, page); if (unlikely(err)) goto failed_page; @@ -551,18 +569,20 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint + * @nilfs: nilfs object * @sbi: nilfs_sb_info - * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, struct nilfs_recovery_info *ri) { - struct nilfs_segsum_info ssi; + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; + unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; @@ -581,8 +601,14 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + brelse(bh_sum); + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) { + err = -EIO; + goto failed; + } - ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; @@ -590,33 +616,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, } goto strayed; } - if (unlikely(NILFS_SEG_HAS_SR(&ssi))) + + flags = le16_to_cpu(sum->ss_flags); + if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ - nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); empty_seg = 0; - nilfs->ns_ctime = ssi.ctime; - if (!(ssi.flags & NILFS_SS_GC)) - nilfs->ns_nongc_ctime = ssi.ctime; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + if (!(flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: - if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) + if (!(flags & NILFS_SS_LOGBGN) || + !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; /* Fall through */ case RF_DSYNC_ST: - if (!NILFS_SEG_DSYNC(&ssi)) + if (!(flags & NILFS_SS_SYNDT)) goto confused; - err = collect_blocks_from_segsum( - sbi, pseg_start, &ssi, &dsync_blocks); + err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, + &dsync_blocks); if (unlikely(err)) goto failed; - if (NILFS_SEG_LOGEND(&ssi)) { - err = recover_dsync_blocks( - sbi, &dsync_blocks, &nsalvaged_blocks); + if (flags & NILFS_SS_LOGEND) { + err = nilfs_recover_dsync_blocks( + nilfs, sbi, &dsync_blocks, + &nsalvaged_blocks); if (unlikely(err)) goto failed; state = RF_INIT_ST; @@ -627,7 +658,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; - pseg_start += ssi.nblocks; + pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; @@ -652,8 +683,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: + brelse(bh_sum); dispose_recovery_list(&dsync_blocks); - nilfs_detach_writer(sbi->s_nilfs, sbi); + nilfs_detach_writer(nilfs, sbi); return err; confused: @@ -667,7 +699,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, - struct nilfs_sb_info *sbi, struct nilfs_recovery_info *ri) { struct buffer_head *bh; @@ -677,7 +708,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; - bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); + bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); BUG_ON(!bh); memset(bh->b_data, 0, bh->b_size); set_buffer_dirty(bh); @@ -690,9 +721,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, } /** - * nilfs_recover_logical_segments - salvage logical segments written after - * the latest super root - * @nilfs: the_nilfs + * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint + * @nilfs: nilfs object * @sbi: nilfs_sb_info * @ri: pointer to a nilfs_recovery_info struct to store search results. * @@ -709,9 +739,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, * * %-ENOMEM - Insufficient memory available. */ -int nilfs_recover_logical_segments(struct the_nilfs *nilfs, - struct nilfs_sb_info *sbi, - struct nilfs_recovery_info *ri) +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) { int err; @@ -751,7 +781,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, goto failed; } - nilfs_finish_roll_forward(nilfs, sbi, ri); + nilfs_finish_roll_forward(nilfs, ri); } failed: @@ -762,7 +792,6 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs - * @sbi: nilfs_sb_info * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial @@ -775,14 +804,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, * %-EINVAL - No valid segment found * * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. */ -int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, +int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { - struct nilfs_segsum_info ssi; + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; + unsigned long nblocks; + unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; @@ -801,17 +835,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, /* Read ahead segment */ b = seg_start; while (b <= seg_end) - sb_breadahead(sbi->s_super, b++); + __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { - /* Load segment summary */ - ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); + brelse(bh_sum); + ret = NILFS_SEG_FAIL_IO; + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) + goto failed; + + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } - pseg_end = pseg_start + ssi.nblocks - 1; + + nblocks = le32_to_cpu(sum->ss_nblocks); + pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; @@ -821,11 +862,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; - nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; - if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) { + flags = le16_to_cpu(sum->ss_flags); + if (!(flags & NILFS_SS_SR) && !scan_newer) { /* This will never happen because a superblock (last_segment) always points to a pseg having a super root. */ @@ -836,14 +879,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) - sb_breadahead(sbi->s_super, b++); + __breadahead(nilfs->ns_bdev, b++, + nilfs->ns_blocksize); } - if (!NILFS_SEG_HAS_SR(&ssi)) { - if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { + if (!(flags & NILFS_SS_SR)) { + if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } - if (NILFS_SEG_LOGEND(&ssi)) + if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } @@ -854,12 +898,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); - nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) - + ssi.nblocks - seg_start; + sr_pseg_start = pseg_start; + nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ - nilfs->ns_ctime = ssi.ctime; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) @@ -870,15 +914,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, scan_newer = 1; } - /* reset region for roll-forward */ - pseg_start += ssi.nblocks; - if (pseg_start < seg_end) - continue; - goto feed_segment; - try_next_pseg: /* Standing on a course, or met an inconsistent state */ - pseg_start += ssi.nblocks; + pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; @@ -909,6 +947,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, super_root_found: /* Updating pointers relating to the latest checkpoint */ + brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; @@ -916,6 +955,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, return 0; failed: + brelse(bh_sum); nilfs_dispose_segment_list(&segments); return (ret < 0) ? ret : nilfs_warn_segment_error(ret); } diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 85fbb66..b04f08c 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -54,17 +54,6 @@ struct nilfs_segsum_info { sector_t next; }; -/* macro for the flags */ -#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR) -#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN) -#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND) -#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT) -#define NILFS_SEG_SIMPLEX(sum) \ - (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \ - (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) - -#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk) - /** * struct nilfs_segment_buffer - Segment buffer * @sb_super: back pointer to a superblock struct @@ -141,6 +130,19 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); +static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf) +{ + unsigned int flags = segbuf->sb_sum.flags; + + return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND); +} + +static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf) +{ + return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk; +} + static inline void nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, struct buffer_head *bh) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c920164..9fd051a 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1914,12 +1914,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) } } - if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) { - if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) { + if (!nilfs_segbuf_simplex(segbuf)) { + if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); sci->sc_lseg_stime = jiffies; } - if (NILFS_SEG_LOGEND(&segbuf->sb_sum)) + if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); } } @@ -1951,7 +1951,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) if (update_sr) { nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno++); - set_nilfs_sb_dirty(nilfs); clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); @@ -2082,7 +2081,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Avoid empty segment */ if (sci->sc_stage.scnt == NILFS_ST_DONE && - NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { + nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; } @@ -2408,6 +2407,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { struct nilfs_sb_info *sbi = sci->sc_sbi; struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp; int err = 0; nilfs_segctor_accept(sci); @@ -2423,8 +2423,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - err = nilfs_commit_super( - sbi, nilfs_altsb_need_update(nilfs)); + err = -EIO; + sbp = nilfs_prepare_super(sbi, + nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + err = nilfs_commit_super(sbi, NILFS_SB_COMMIT); + } up_write(&nilfs->ns_sem); } } diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 01e20db..17c487b 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -234,13 +234,13 @@ extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); /* recovery.c */ -extern int nilfs_read_super_root_block(struct super_block *, sector_t, +extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, struct buffer_head **, int); -extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, +extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_recovery_info *); -extern int nilfs_recover_logical_segments(struct the_nilfs *, - struct nilfs_sb_info *, - struct nilfs_recovery_info *); +extern int nilfs_salvage_orphan_logs(struct the_nilfs *, + struct nilfs_sb_info *, + struct nilfs_recovery_info *); extern void nilfs_dispose_segment_list(struct list_head *); #endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 414ef68..26078b3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -55,6 +55,8 @@ #include "nilfs.h" #include "mdt.h" #include "alloc.h" +#include "btree.h" +#include "btnode.h" #include "page.h" #include "cpfile.h" #include "ifile.h" @@ -74,6 +76,25 @@ struct kmem_cache *nilfs_btree_path_cache; static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static void nilfs_set_error(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + sbp = nilfs_prepare_super(sbi, 0); + if (likely(sbp)) { + sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + if (sbp[1]) + sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); + } + } + up_write(&nilfs->ns_sem); +} + /** * nilfs_error() - report failure condition on a filesystem * @@ -99,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function, va_end(args); if (!(sb->s_flags & MS_RDONLY)) { - struct the_nilfs *nilfs = sbi->s_nilfs; - - down_write(&nilfs->ns_sem); - if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { - nilfs->ns_mount_state |= NILFS_ERROR_FS; - nilfs->ns_sbp[0]->s_state |= - cpu_to_le16(NILFS_ERROR_FS); - nilfs_commit_super(sbi, 1); - } - up_write(&nilfs->ns_sem); + nilfs_set_error(sbi); if (nilfs_test_opt(sbi, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); @@ -176,7 +188,7 @@ static void nilfs_clear_inode(struct inode *inode) nilfs_btnode_cache_clear(&ii->i_btnode_cache); } -static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) { struct the_nilfs *nilfs = sbi->s_nilfs; int err; @@ -202,12 +214,20 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) printk(KERN_ERR "NILFS: unable to write superblock (err=%d)\n", err); if (err == -EIO && nilfs->ns_sbh[1]) { + /* + * sbp[0] points to newer log than sbp[1], + * so copy sbp[0] to sbp[1] to take over sbp[0]. + */ + memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], + nilfs->ns_sbsize); nilfs_fall_back_super_block(nilfs); goto retry; } } else { struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + nilfs->ns_sbwcount++; + /* * The latest segment becomes trailable from the position * written in superblock. @@ -216,66 +236,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) /* update GC protection for recent segments */ if (nilfs->ns_sbh[1]) { - sbp = NULL; - if (dupsb) { + if (flag == NILFS_SB_COMMIT_ALL) { set_buffer_dirty(nilfs->ns_sbh[1]); - if (!sync_dirty_buffer(nilfs->ns_sbh[1])) - sbp = nilfs->ns_sbp[1]; + if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) + goto out; } + if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < + le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) + sbp = nilfs->ns_sbp[1]; } - if (sbp) { - spin_lock(&nilfs->ns_last_segment_lock); - nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); - spin_unlock(&nilfs->ns_last_segment_lock); - } - } + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + out: return err; } -int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) +void nilfs_set_log_cursor(struct nilfs_super_block *sbp, + struct the_nilfs *nilfs) +{ + sector_t nfreeblocks; + + /* nilfs->ns_sem must be locked by the caller. */ + nilfs_count_free_blocks(nilfs, &nfreeblocks); + sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); + + spin_lock(&nilfs->ns_last_segment_lock); + sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); +} + +struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, + int flip) { struct the_nilfs *nilfs = sbi->s_nilfs; struct nilfs_super_block **sbp = nilfs->ns_sbp; - sector_t nfreeblocks; - time_t t; - int err; - /* nilfs->sem must be locked by the caller. */ + /* nilfs->ns_sem must be locked by the caller. */ if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { - if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) - nilfs_swap_super_block(nilfs); - else { + if (sbp[1] && + sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + } else { printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", sbi->s_super->s_id); - return -EIO; + return NULL; } + } else if (sbp[1] && + sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); } - err = nilfs_count_free_blocks(nilfs, &nfreeblocks); - if (unlikely(err)) { - printk(KERN_ERR "NILFS: failed to count free blocks\n"); - return err; - } - spin_lock(&nilfs->ns_last_segment_lock); - sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); - sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); - sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); - spin_unlock(&nilfs->ns_last_segment_lock); + if (flip && sbp[1]) + nilfs_swap_super_block(nilfs); + + return sbp; +} + +int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + time_t t; + + /* nilfs->ns_sem must be locked by the caller. */ t = get_seconds(); - nilfs->ns_sbwtime[0] = t; - sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, (unsigned char *)sbp[0], nilfs->ns_sbsize)); - if (dupsb && sbp[1]) { - memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); - nilfs->ns_sbwtime[1] = t; + if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { + sbp[1]->s_wtime = sbp[0]->s_wtime; + sbp[1]->s_sum = 0; + sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[1], + nilfs->ns_sbsize)); } clear_nilfs_sb_dirty(nilfs); - return nilfs_sync_super(sbi, dupsb); + return nilfs_sync_super(sbi, flag); +} + +/** + * nilfs_cleanup_super() - write filesystem state for cleanup + * @sbi: nilfs_sb_info to be unmounted or degraded to read-only + * + * This function restores state flags in the on-disk super block. + * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the + * filesystem was not clean previously. + */ +int nilfs_cleanup_super(struct nilfs_sb_info *sbi) +{ + struct nilfs_super_block **sbp; + int flag = NILFS_SB_COMMIT; + int ret = -EIO; + + sbp = nilfs_prepare_super(sbi, 0); + if (sbp) { + sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); + nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); + if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { + /* + * make the "clean" flag also to the opposite + * super block if both super blocks point to + * the same checkpoint. + */ + sbp[1]->s_state = sbp[0]->s_state; + flag = NILFS_SB_COMMIT_ALL; + } + ret = nilfs_commit_super(sbi, flag); + } + return ret; } static void nilfs_put_super(struct super_block *sb) @@ -289,8 +365,7 @@ static void nilfs_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { down_write(&nilfs->ns_sem); - nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); - nilfs_commit_super(sbi, 1); + nilfs_cleanup_super(sbi); up_write(&nilfs->ns_sem); } down_write(&nilfs->ns_super_sem); @@ -311,6 +386,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) { struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp; int err = 0; /* This function is called when super block should be written back */ @@ -318,8 +394,13 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) err = nilfs_construct_segment(sb); down_write(&nilfs->ns_sem); - if (nilfs_sb_dirty(nilfs)) - nilfs_commit_super(sbi, 1); + if (nilfs_sb_dirty(nilfs)) { + sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + nilfs_commit_super(sbi, NILFS_SB_COMMIT); + } + } up_write(&nilfs->ns_sem); return err; @@ -442,20 +523,20 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) struct nilfs_sb_info *sbi = NILFS_SB(sb); if (!nilfs_test_opt(sbi, BARRIER)) - seq_printf(seq, ",nobarrier"); + seq_puts(seq, ",nobarrier"); if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); if (nilfs_test_opt(sbi, ERRORS_PANIC)) - seq_printf(seq, ",errors=panic"); + seq_puts(seq, ",errors=panic"); if (nilfs_test_opt(sbi, ERRORS_CONT)) - seq_printf(seq, ",errors=continue"); + seq_puts(seq, ",errors=continue"); if (nilfs_test_opt(sbi, STRICT_ORDER)) - seq_printf(seq, ",order=strict"); + seq_puts(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) - seq_printf(seq, ",norecovery"); + seq_puts(seq, ",norecovery"); if (nilfs_test_opt(sbi, DISCARD)) - seq_printf(seq, ",discard"); + seq_puts(seq, ",discard"); return 0; } @@ -524,23 +605,25 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_discard, Opt_err, + Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, Opt_nodiscard, Opt_err, }; static match_table_t tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL} }; -static int parse_options(char *options, struct super_block *sb) +static int parse_options(char *options, struct super_block *sb, int is_remount) { struct nilfs_sb_info *sbi = NILFS_SB(sb); char *p; @@ -557,6 +640,9 @@ static int parse_options(char *options, struct super_block *sb) token = match_token(p, tokens, args); switch (token) { + case Opt_barrier: + nilfs_set_opt(sbi, BARRIER); + break; case Opt_nobarrier: nilfs_clear_opt(sbi, BARRIER); break; @@ -582,8 +668,26 @@ static int parse_options(char *options, struct super_block *sb) case Opt_snapshot: if (match_int(&args[0], &option) || option <= 0) return 0; - if (!(sb->s_flags & MS_RDONLY)) + if (is_remount) { + if (!nilfs_test_opt(sbi, SNAPSHOT)) { + printk(KERN_ERR + "NILFS: cannot change regular " + "mount to snapshot.\n"); + return 0; + } else if (option != sbi->s_snapshot_cno) { + printk(KERN_ERR + "NILFS: cannot remount to a " + "different snapshot.\n"); + return 0; + } + break; + } + if (!(sb->s_flags & MS_RDONLY)) { + printk(KERN_ERR "NILFS: cannot mount snapshot " + "read/write. A read-only option is " + "required.\n"); return 0; + } sbi->s_snapshot_cno = option; nilfs_set_opt(sbi, SNAPSHOT); break; @@ -593,6 +697,9 @@ static int parse_options(char *options, struct super_block *sb) case Opt_discard: nilfs_set_opt(sbi, DISCARD); break; + case Opt_nodiscard: + nilfs_clear_opt(sbi, DISCARD); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); @@ -613,11 +720,18 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi, static int nilfs_setup_super(struct nilfs_sb_info *sbi) { struct the_nilfs *nilfs = sbi->s_nilfs; - struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; - int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); - int mnt_count = le16_to_cpu(sbp->s_mnt_count); + struct nilfs_super_block **sbp; + int max_mnt_count; + int mnt_count; + + /* nilfs->ns_sem must be locked by the caller. */ + sbp = nilfs_prepare_super(sbi, 0); + if (!sbp) + return -EIO; + + max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); + mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); - /* nilfs->sem must be locked by the caller. */ if (nilfs->ns_mount_state & NILFS_ERROR_FS) { printk(KERN_WARNING "NILFS warning: mounting fs with errors\n"); @@ -628,12 +742,15 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi) #endif } if (!max_mnt_count) - sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); - - sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); - sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); - sbp->s_mtime = cpu_to_le64(get_seconds()); - return nilfs_commit_super(sbi, 1); + sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp[0]->s_state = + cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); + sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + /* synchronize sbp[1] with sbp[0] */ + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); } struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, @@ -670,7 +787,31 @@ int nilfs_store_magic_and_option(struct super_block *sb, sbi->s_interval = le32_to_cpu(sbp->s_c_interval); sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); - return !parse_options(data, sb) ? -EINVAL : 0 ; + return !parse_options(data, sb, 0) ? -EINVAL : 0 ; +} + +int nilfs_check_feature_compatibility(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + __u64 features; + + features = le64_to_cpu(sbp->s_feature_incompat) & + ~NILFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + features = le64_to_cpu(sbp->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "NILFS: couldn't mount RDWR because of " + "unsupported optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + return 0; } /** @@ -819,7 +960,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, static int nilfs_remount(struct super_block *sb, int *flags, char *data) { struct nilfs_sb_info *sbi = NILFS_SB(sb); - struct nilfs_super_block *sbp; struct the_nilfs *nilfs = sbi->s_nilfs; unsigned long old_sb_flags; struct nilfs_mount_options old_opts; @@ -833,32 +973,17 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) old_opts.snapshot_cno = sbi->s_snapshot_cno; was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); - if (!parse_options(data, sb)) { + if (!parse_options(data, sb, 1)) { err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL); err = -EINVAL; - if (was_snapshot) { - if (!(*flags & MS_RDONLY)) { - printk(KERN_ERR "NILFS (device %s): cannot remount " - "snapshot read/write.\n", - sb->s_id); - goto restore_opts; - } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) { - printk(KERN_ERR "NILFS (device %s): cannot " - "remount to a different snapshot.\n", - sb->s_id); - goto restore_opts; - } - } else { - if (nilfs_test_opt(sbi, SNAPSHOT)) { - printk(KERN_ERR "NILFS (device %s): cannot change " - "a regular mount to a snapshot.\n", - sb->s_id); - goto restore_opts; - } + if (was_snapshot && !(*flags & MS_RDONLY)) { + printk(KERN_ERR "NILFS (device %s): cannot remount snapshot " + "read/write.\n", sb->s_id); + goto restore_opts; } if (!nilfs_valid_fs(nilfs)) { @@ -880,19 +1005,29 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * the RDONLY flag and then mark the partition as valid again. */ down_write(&nilfs->ns_sem); - sbp = nilfs->ns_sbp[0]; - if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && - (nilfs->ns_mount_state & NILFS_VALID_FS)) - sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); - sbp->s_mtime = cpu_to_le64(get_seconds()); - nilfs_commit_super(sbi, 1); + nilfs_cleanup_super(sbi); up_write(&nilfs->ns_sem); } else { + __u64 features; + /* * Mounting a RDONLY partition read-write, so reread and * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ + down_read(&nilfs->ns_sem); + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + up_read(&nilfs->ns_sem); + if (features) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount RDWR because of unsupported optional " + "features (%llx)\n", + sb->s_id, (unsigned long long)features); + err = -EROFS; + goto restore_opts; + } + sb->s_flags &= ~MS_RDONLY; err = nilfs_attach_segment_constructor(sbi); @@ -1119,7 +1254,7 @@ static void nilfs_inode_init_once(void *obj) init_rwsem(&ii->xattr_sem); #endif nilfs_btnode_cache_init_once(&ii->i_btnode_cache); - ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8c10973..37de1f0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -38,6 +38,8 @@ static LIST_HEAD(nilfs_objects); static DEFINE_SPINLOCK(nilfs_lock); +static int nilfs_valid_sb(struct nilfs_super_block *sbp); + void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { @@ -45,6 +47,16 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, nilfs->ns_last_pseg = start_blocknr; nilfs->ns_last_seq = seq; nilfs->ns_last_cno = cno; + + if (!nilfs_sb_dirty(nilfs)) { + if (nilfs->ns_prev_seq == nilfs->ns_last_seq) + goto stay_cursor; + + set_nilfs_sb_dirty(nilfs); + } + nilfs->ns_prev_seq = nilfs->ns_last_seq; + + stay_cursor: spin_unlock(&nilfs->ns_last_segment_lock); } @@ -159,8 +171,7 @@ void put_nilfs(struct the_nilfs *nilfs) kfree(nilfs); } -static int nilfs_load_super_root(struct the_nilfs *nilfs, - struct nilfs_sb_info *sbi, sector_t sr_block) +static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block) { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; @@ -169,7 +180,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, unsigned inode_size; int err; - err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); + err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); if (unlikely(err)) return err; @@ -248,6 +259,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) } /** + * nilfs_store_log_cursor - load log cursor from a super block + * @nilfs: nilfs object + * @sbp: buffer storing super block to be read + * + * nilfs_store_log_cursor() reads the last position of the log + * containing a super root from a given super block, and initializes + * relevant information on the nilfs object preparatory for log + * scanning and recovery. + */ +static int nilfs_store_log_cursor(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + int ret = 0; + + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_prev_seq = nilfs->ns_last_seq; + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + ret = -EINVAL; + } + return ret; +} + +/** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released * @sbi: nilfs_sb_info used to recover past segment @@ -285,13 +327,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_init_recovery_info(&ri); - err = nilfs_search_super_root(nilfs, sbi, &ri); + err = nilfs_search_super_root(nilfs, &ri); if (unlikely(err)) { - printk(KERN_ERR "NILFS: error searching super root.\n"); - goto failed; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + int blocksize; + + if (err != -EINVAL) + goto scan_error; + + if (!nilfs_valid_sb(sbp[1])) { + printk(KERN_WARNING + "NILFS warning: unable to fall back to spare" + "super block\n"); + goto scan_error; + } + printk(KERN_INFO + "NILFS: try rollback from an earlier position\n"); + + /* + * restore super block with its spare and reconfigure + * relevant states of the nilfs object. + */ + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + + /* verify consistency between two super blocks */ + blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + if (blocksize != nilfs->ns_blocksize) { + printk(KERN_WARNING + "NILFS warning: blocksize differs between " + "two super blocks (%d != %d)\n", + blocksize, nilfs->ns_blocksize); + goto scan_error; + } + + err = nilfs_store_log_cursor(nilfs, sbp[0]); + if (err) + goto scan_error; + + /* drop clean flag to allow roll-forward and recovery */ + nilfs->ns_mount_state &= ~NILFS_VALID_FS; + valid_fs = 0; + + err = nilfs_search_super_root(nilfs, &ri); + if (err) + goto scan_error; } - err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); + err = nilfs_load_super_root(nilfs, ri.ri_super_root); if (unlikely(err)) { printk(KERN_ERR "NILFS: error loading super root.\n"); goto failed; @@ -301,11 +385,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) goto skip_recovery; if (s_flags & MS_RDONLY) { + __u64 features; + if (nilfs_test_opt(sbi, NORECOVERY)) { printk(KERN_INFO "NILFS: norecovery option specified. " "skipping roll-forward recovery\n"); goto skip_recovery; } + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't proceed with " + "recovery because of unsupported optional " + "features (%llx)\n", + (unsigned long long)features); + err = -EROFS; + goto failed_unload; + } if (really_read_only) { printk(KERN_ERR "NILFS: write access " "unavailable, cannot proceed.\n"); @@ -320,14 +416,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) goto failed_unload; } - err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri); if (err) goto failed_unload; down_write(&nilfs->ns_sem); - nilfs->ns_mount_state |= NILFS_VALID_FS; - nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); - err = nilfs_commit_super(sbi, 1); + nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ + err = nilfs_cleanup_super(sbi); up_write(&nilfs->ns_sem); if (err) { @@ -343,6 +438,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) sbi->s_super->s_flags = s_flags; return 0; + scan_error: + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + failed_unload: nilfs_mdt_destroy(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_sufile); @@ -515,8 +614,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, nilfs_swap_super_block(nilfs); } - nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); - nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_sbwcount = 0; + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); *sbpp = sbp[0]; return 0; @@ -557,6 +656,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) if (err) goto out; + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto out; + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize && !sb_set_blocksize(sb, blocksize)) { @@ -568,7 +671,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto out; } - blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { printk(KERN_ERR "NILFS: unable to set blocksize\n"); err = -EINVAL; @@ -582,7 +685,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) if (err) goto failed_sbh; + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto failed_sbh; + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (blocksize < NILFS_MIN_BLOCK_SIZE || + blocksize > NILFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "filesystem blocksize %d\n", blocksize); + err = -EINVAL; + goto failed_sbh; + } if (sb->s_blocksize != blocksize) { int hw_blocksize = bdev_logical_block_size(sb->s_bdev); @@ -604,6 +718,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) when reloading fails. */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + nilfs->ns_blocksize = blocksize; err = nilfs_store_disk_layout(nilfs, sbp); if (err) @@ -616,23 +731,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; nilfs->ns_bdi = bdi ? : &default_backing_dev_info; - /* Finding last segment */ - nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); - nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); - nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); - - nilfs->ns_seg_seq = nilfs->ns_last_seq; - nilfs->ns_segnum = - nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); - nilfs->ns_cno = nilfs->ns_last_cno + 1; - if (nilfs->ns_segnum >= nilfs->ns_nsegments) { - printk(KERN_ERR "NILFS invalid last segment number.\n"); - err = -EINVAL; + err = nilfs_store_log_cursor(nilfs, sbp); + if (err) goto failed_sbh; - } - /* Dummy values */ - nilfs->ns_free_segments_count = - nilfs->ns_nsegments - (nilfs->ns_segnum + 1); /* Initialize gcinode cache */ err = nilfs_init_gccache(nilfs); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 1ab9745..f785a7b 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -57,7 +57,8 @@ enum { * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data - * @ns_sbwtime: previous write time of super blocks + * @ns_sbwtime: previous write time of super block + * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block * @ns_supers: list of nilfs super block structs * @ns_seg_seq: segment sequence counter @@ -73,7 +74,7 @@ enum { * @ns_last_seq: sequence value of the latest segment * @ns_last_cno: checkpoint number of the latest segment * @ns_prot_seq: least sequence number of segments which must not be reclaimed - * @ns_free_segments_count: counter of free segments + * @ns_prev_seq: base sequence number used to decide if advance log cursor * @ns_segctor_sem: segment constructor semaphore * @ns_dat: DAT file inode * @ns_cpfile: checkpoint file inode @@ -82,6 +83,7 @@ enum { * @ns_gc_inodes: dummy inodes to keep live blocks * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks * @ns_blocksize_bits: bit length of block size + * @ns_blocksize: block size * @ns_nsegments: number of segments in filesystem * @ns_blocks_per_segment: number of blocks per segment * @ns_r_segments_percentage: reserved segments percentage @@ -119,7 +121,8 @@ struct the_nilfs { */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; - time_t ns_sbwtime[2]; + time_t ns_sbwtime; + unsigned ns_sbwcount; unsigned ns_sbsize; unsigned ns_mount_state; @@ -149,7 +152,7 @@ struct the_nilfs { u64 ns_last_seq; __u64 ns_last_cno; u64 ns_prot_seq; - unsigned long ns_free_segments_count; + u64 ns_prev_seq; struct rw_semaphore ns_segctor_sem; @@ -168,6 +171,7 @@ struct the_nilfs { /* Disk layout information (static) */ unsigned int ns_blocksize_bits; + unsigned int ns_blocksize; unsigned long ns_nsegments; unsigned long ns_blocks_per_segment; unsigned long ns_r_segments_percentage; @@ -203,20 +207,17 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 -#define NILFS_ALTSB_FREQ 60 /* spare superblock */ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); - return t < nilfs->ns_sbwtime[0] || - t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; + return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; } -static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) +static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) { - u64 t = get_seconds(); - struct nilfs_super_block **sbp = nilfs->ns_sbp; - return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); } void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca2..96337a4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, dump_stack(); goto bail; } - - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); - - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); } + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + bail: if (err < 0) err = -EIO; @@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle, return ret; } -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle; - int ret = 0; - - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) - mlog_errno(ret); - } -out: - if (ret) { - if (!IS_ERR(handle)) - ocfs2_commit_trans(osb, handle); - handle = ERR_PTR(ret); - } - return handle; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -609,7 +578,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; @@ -623,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (!level) up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); } /* @@ -1131,23 +1105,37 @@ out: */ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, int new, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct page *mmap_page) { int ret = 0, i; - unsigned long start, target_index, index; + unsigned long start, target_index, end_index, index; struct inode *inode = mapping->host; + loff_t last_byte; target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one - * page. Otherwise, we'll need a whole clusters worth. + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. */ if (new) { wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; @@ -1620,21 +1608,20 @@ out: * write path can treat it as an non-allocating write, which has no * special case code for sparse/nonsparse files. */ -static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, - unsigned len, +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, struct ocfs2_write_ctxt *wc) { int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); loff_t newsize = pos + len; - if (ocfs2_sparse_alloc(osb)) - return 0; + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, pos); + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); if (ret) mlog_errno(ret); @@ -1644,6 +1631,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, return ret; } +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, @@ -1679,7 +1678,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } } - ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); if (ret) { mlog_errno(ret); goto out; @@ -1789,7 +1792,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * that we can zero and flush if we error after adding the * extent. */ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 0cd24cf..5efdd37 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -419,7 +419,7 @@ static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) static int debug_buffer_release(struct inode *inode, struct file *file) { - struct debug_buffer *db = (struct debug_buffer *)file->private_data; + struct debug_buffer *db = file->private_data; if (db) kfree(db->buf); @@ -715,7 +715,7 @@ static int debug_lockres_open(struct inode *inode, struct file *file) goto bail; } - seq = (struct seq_file *) file->private_data; + seq = file->private_data; seq->private = dl; dlm_grab(dlm); @@ -731,7 +731,7 @@ bail: static int debug_lockres_release(struct inode *inode, struct file *file) { - struct seq_file *seq = (struct seq_file *)file->private_data; + struct seq_file *seq = file->private_data; struct debug_lockres *dl = (struct debug_lockres *)seq->private; if (dl->dl_res) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6b5a492..153abb5 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, struct dlm_ctxt *dlm = NULL; struct dlm_ctxt *new_ctxt = NULL; - if (strlen(domain) > O2NM_MAX_NAME_LEN) { + if (strlen(domain) >= O2NM_MAX_NAME_LEN) { ret = -ENAMETOOLONG; mlog(ML_ERROR, "domain name length too long\n"); goto leave; @@ -1709,6 +1709,7 @@ retry: } if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) { + spin_unlock(&dlm_domain_lock); mlog(ML_ERROR, "Requested locking protocol version is not " "compatible with already registered domain " diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4a7506a..94b97fc 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2808,14 +2808,8 @@ again: mlog(0, "trying again...\n"); goto again; } - /* now that we are sure the MIGRATING state is there, drop - * the unneded state which blocked threads trying to DIRTY */ - spin_lock(&res->spinlock); - BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); - BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); - res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; - spin_unlock(&res->spinlock); + ret = 0; /* did the target go down or die? */ spin_lock(&dlm->spinlock); if (!test_bit(target, dlm->domain_map)) { @@ -2826,9 +2820,21 @@ again: spin_unlock(&dlm->spinlock); /* + * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for + * another try; otherwise, we are sure the MIGRATING state is there, + * drop the unneded state which blocked threads trying to DIRTY + */ + spin_lock(&res->spinlock); + BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); + res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; + if (!ret) + BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + spin_unlock(&res->spinlock); + + /* * at this point: * - * o the DLM_LOCK_RES_MIGRATING flag is set + * o the DLM_LOCK_RES_MIGRATING flag is set if target not down * o there are no pending asts on this lockres * o all processes trying to reserve an ast on this * lockres must wait for the MIGRATING flag to clear diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f8b75ce..9dfaac7 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b83d610..bef34d0 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -182,8 +182,7 @@ static int dlmfs_file_release(struct inode *inode, { int level, status; struct dlmfs_inode_private *ip = DLMFS_I(inode); - struct dlmfs_filp_private *fp = - (struct dlmfs_filp_private *) file->private_data; + struct dlmfs_filp_private *fp = file->private_data; if (S_ISDIR(inode->i_mode)) BUG(); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 39eb16a..5e02a89 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2966,7 +2966,7 @@ static const struct seq_operations ocfs2_dlm_seq_ops = { static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) { - struct seq_file *seq = (struct seq_file *) file->private_data; + struct seq_file *seq = file->private_data; struct ocfs2_dlm_seq_priv *priv = seq->private; struct ocfs2_lock_res *res = &priv->p_iter_res; @@ -3000,7 +3000,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) goto out; } - seq = (struct seq_file *) file->private_data; + seq = file->private_data; seq->private = priv; ocfs2_add_lockres_tracking(&priv->p_iter_res, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6a13ea6..2b10b36 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -724,28 +724,55 @@ leave: return status; } +/* + * While a write will already be ordering the data, a truncate will not. + * Thus, we need to explicitly order the zeroed pages. + */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ -static int ocfs2_write_zero_page(struct inode *inode, - u64 size) +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index; - unsigned int offset; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; handle_t *handle = NULL; - int ret; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); page = grab_cache_page(mapping, index); if (!page) { @@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; - if (ocfs2_should_order_data(inode)) { - handle = ocfs2_start_walk_page_trans(inode, page, offset, - offset); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + mlog(0, + "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", + (unsigned long long)abs_from, (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to + * force ocfs2_{prepare,commit}_write() to zero the + * whole block. + */ + ret = ocfs2_prepare_write_nolock(inode, page, + block_start + 1, + block_start + 1); + if (ret < 0) { + mlog_errno(ret); goto out_unlock; } - } - /* must not update i_size! */ - ret = block_commit_write(page, offset, offset); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + if (!handle) { + handle = ocfs2_zero_start_ordered_transaction(inode); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + break; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + out_unlock: unlock_page(page); page_cache_release(page); @@ -786,22 +838,114 @@ out: return ret; } -static int ocfs2_zero_extend(struct inode *inode, - u64 zero_to_size) +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. + */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) { - int ret = 0; - u64 start_off; - struct super_block *sb = inode->i_sb; + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; - start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); - while (start_off < zero_to_size) { - ret = ocfs2_write_zero_page(inode, start_off); - if (ret < 0) { - mlog_errno(ret); + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); goto out; } - start_off += sb->s_blocksize; + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, + UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. + */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + mlog(0, "range_start = %llu, range_end = %llu\n", + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; /* * Very large extends have the potential to lock up @@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode, cond_resched(); } -out: + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + mlog(0, "zero_start %llu for i_size %llu\n", + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + return ret; } -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) { int ret; u32 clusters_to_add; struct ocfs2_inode_info *oi = OCFS2_I(inode); + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); if (clusters_to_add < oi->ip_clusters) clusters_to_add = 0; @@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) * still need to zero the area between the old i_size and the * new i_size. */ - ret = ocfs2_zero_extend(inode, zero_to); + ret = ocfs2_zero_extend(inode, di_bh, zero_to); if (ret < 0) mlog_errno(ret); @@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode, goto out; if (i_size_read(inode) == new_i_size) - goto out; + goto out; BUG_ON(new_i_size < i_size_read(inode)); /* - * Fall through for converting inline data, even if the fs - * supports sparse files. - * - * The check for inline data here is legal - nobody can add - * the feature since we have i_mutex. We must check it again - * after acquiring ip_alloc_sem though, as paths like mmap - * might have raced us to converting the inode to extents. - */ - if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - goto out_update_size; - - /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on * i_mutex to block other extend/truncate calls while we're - * here. + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. */ down_write(&oi->ip_alloc_sem); @@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode, ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); if (ret) { up_write(&oi->ip_alloc_sem); - mlog_errno(ret); goto out; } } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); up_write(&oi->ip_alloc_sem); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index d66cf4f..97bf761 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, - u64 zero_to); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 47878cf..9b57c03 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger return container_of(triggers, struct ocfs2_triggers, ot_triggers); } -static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Quota blocks have their own trigger because the struct ocfs2_block_check * offset depends on the blocksize. */ -static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Directory blocks also have their own trigger because the * struct ocfs2_block_check offset depends on the blocksize. */ -static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, static struct ocfs2_triggers di_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dinode, i_check), @@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = { static struct ocfs2_triggers eb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_extent_block, h_check), @@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = { static struct ocfs2_triggers rb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), @@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = { static struct ocfs2_triggers gd_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), @@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = { static struct ocfs2_triggers db_triggers = { .ot_triggers = { - .t_commit = ocfs2_db_commit_trigger, + .t_frozen = ocfs2_db_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers xb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), @@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = { static struct ocfs2_triggers dq_triggers = { .ot_triggers = { - .t_commit = ocfs2_dq_commit_trigger, + .t_frozen = ocfs2_dq_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers dr_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), @@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = { static struct ocfs2_triggers dl_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), @@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) if (osb->osb_commit_interval) commit_interval = osb->osb_commit_interval; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) @@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - schedule_delayed_work(&os->os_orphan_scan_work, + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - schedule_delayed_work(&os->os_orphan_scan_work, - ocfs2_orphan_scan_timeout()); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); } } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 3d74196..ec6adbf 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) { unsigned int la_mb; unsigned int gd_mb; + unsigned int la_max_mb; unsigned int megs_per_slot; struct super_block *sb = osb->sb; @@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) if (megs_per_slot < la_mb) la_mb = megs_per_slot; + /* We can't store more bits than we can in a block. */ + la_max_mb = ocfs2_clusters_to_megabytes(osb->sb, + ocfs2_local_alloc_size(sb) * 8); + if (la_mb > la_max_mb) + la_mb = la_max_mb; + return la_mb; } diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 2bb35fe..4607923 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) * locking allocators ranks above a transaction start */ WARN_ON(journal_current_handle()); - status = ocfs2_extend_no_holes(gqinode, + status = ocfs2_extend_no_holes(gqinode, NULL, gqinode->i_size + (need_alloc << sb->s_blocksize_bits), gqinode->i_size); if (status < 0) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 8bd70d4..dc78764 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ - status = ocfs2_extend_no_holes(lqinode, + status = ocfs2_extend_no_holes(lqinode, NULL, lqinode->i_size + 2 * sb->s_blocksize, lqinode->i_size); if (status < 0) { @@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( return ocfs2_local_quota_add_chunk(sb, type, offset); /* We are protected by dqio_sem so no locking needed */ - status = ocfs2_extend_no_holes(lqinode, + status = ocfs2_extend_no_holes(lqinode, NULL, lqinode->i_size + sb->s_blocksize, lqinode->i_size); if (status < 0) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 4793f36..3ac5aa7 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + /* + * We only duplicate pages until we reach the page contains i_size - 1. + * So trim 'end' to i_size. + */ + if (end > i_size_read(context->inode)) + end = i_size_read(context->inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; struct buffer_head *new_bh = NULL; + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = filemap_fdatawrite(inode->i_mapping); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f4c2a9e..a8e6a95 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0eaa929..03a799f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2472,7 +2472,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->slot_recovery_generations); /* FIXME * This belongs in journal shutdown, but because we have to - * allocate osb->journal at the start of ocfs2_initalize_osb(), + * allocate osb->journal at the start of ocfs2_initialize_osb(), * we free it here. */ kfree(osb->journal); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e97b348..d03469f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, struct ocfs2_xattr_value_buf *vb, struct ocfs2_xattr_set_ctxt *ctxt) { - int status = 0; + int status = 0, credits; handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); @@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } + while (clusters_to_add) { + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } - prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); - status = ocfs2_add_clusters_in_btree(handle, - &et, - &logical_start, - clusters_to_add, - 0, - ctxt->data_ac, - ctxt->meta_ac, - &why); - if (status < 0) { - mlog_errno(status); - goto leave; - } + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } - ocfs2_journal_dirty(handle, vb->vb_bh); + ocfs2_journal_dirty(handle, vb->vb_bh); - clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; - /* - * We should have already allocated enough space before the transaction, - * so no need to restart. - */ - BUG_ON(why != RESTART_NONE || clusters_to_add); - -leave: + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. + */ + BUG_ON(why == RESTART_META); + + mlog(0, "restarting xattr value extension for %u" + " clusters,.\n", clusters_to_add); + credits = ocfs2_calc_extend_credits(inode->i_sb, + &vb->vb_xv->xr_list, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + status = -ENOMEM; + mlog_errno(status); + break; + } + } + } return status; } @@ -6788,16 +6804,15 @@ out: return ret; } -static int ocfs2_reflink_xattr_buckets(handle_t *handle, +static int ocfs2_reflink_xattr_bucket(handle_t *handle, u64 blkno, u64 new_blkno, u32 clusters, + u32 *cpos, int num_buckets, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac, struct ocfs2_reflink_xattr_tree_args *args) { int i, j, ret = 0; struct super_block *sb = args->reflink->old_inode->i_sb; - u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); - u32 num_buckets = clusters * bpc; int bpb = args->old_bucket->bu_blocks; struct ocfs2_xattr_value_buf vb = { .vb_access = ocfs2_journal_access, @@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, break; } - /* - * The real bucket num in this series of blocks is stored - * in the 1st bucket. - */ - if (i == 0) - num_buckets = le16_to_cpu( - bucket_xh(args->old_bucket)->xh_num_buckets); - ret = ocfs2_xattr_bucket_journal_access(handle, args->new_bucket, OCFS2_JOURNAL_ACCESS_CREATE); @@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, bucket_block(args->old_bucket, j), sb->s_blocksize); + /* + * Record the start cpos so that we can use it to initialize + * our xattr tree we also set the xh_num_bucket for the new + * bucket. + */ + if (i == 0) { + *cpos = le32_to_cpu(bucket_xh(args->new_bucket)-> + xh_entries[0].xe_name_hash); + bucket_xh(args->new_bucket)->xh_num_buckets = + cpu_to_le16(num_buckets); + } + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); ret = ocfs2_reflink_xattr_header(handle, args->reflink, @@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, } ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + ocfs2_xattr_bucket_relse(args->old_bucket); ocfs2_xattr_bucket_relse(args->new_bucket); } @@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle, ocfs2_xattr_bucket_relse(args->new_bucket); return ret; } + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + struct inode *inode, + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *et, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + u64 blkno, u32 cpos, u32 len) +{ + int ret, first_inserted = 0; + u32 p_cluster, num_clusters, reflink_cpos = 0; + u64 new_blkno; + unsigned int num_buckets, reflink_buckets; + unsigned int bpc = + ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets); + ocfs2_xattr_bucket_relse(args->old_bucket); + + while (len && num_buckets) { + ret = ocfs2_claim_clusters(handle, data_ac, + 1, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + reflink_buckets = min(num_buckets, bpc * num_clusters); + + ret = ocfs2_reflink_xattr_bucket(handle, blkno, + new_blkno, num_clusters, + &reflink_cpos, reflink_buckets, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * For the 1st allocated cluster, we make it use the same cpos + * so that the xattr tree looks the same as the original one + * in the most case. + */ + if (!first_inserted) { + reflink_cpos = cpos; + first_inserted = 1; + } + ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno, + num_clusters, 0, meta_ac); + if (ret) + mlog_errno(ret); + + mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", + (unsigned long long)new_blkno, num_clusters, reflink_cpos); + + len -= num_clusters; + blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + num_buckets -= reflink_buckets; + } +out: + return ret; +} + /* * Create the same xattr extent record in the new inode's xattr tree. */ @@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, void *para) { int ret, credits = 0; - u32 p_cluster, num_clusters; - u64 new_blkno; handle_t *handle; struct ocfs2_reflink_xattr_tree_args *args = (struct ocfs2_reflink_xattr_tree_args *)para; @@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_extent_tree et; + mlog(0, "reflink xattr buckets %llu len %u\n", + (unsigned long long)blkno, len); + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(args->reflink->new_inode), args->new_blk_bh); @@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode, goto out; } - ret = ocfs2_claim_clusters(handle, data_ac, - len, &p_cluster, &num_clusters); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster); - - mlog(0, "reflink xattr buckets %llu to %llu, len %u\n", - (unsigned long long)blkno, (unsigned long long)new_blkno, len); - ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len, - meta_ac, data_ac, args); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", - (unsigned long long)new_blkno, len, cpos); - ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno, - len, 0, meta_ac); + ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et, + meta_ac, data_ac, + blkno, cpos, len); if (ret) mlog_errno(ret); -out_commit: ocfs2_commit_trans(osb, handle); out: @@ -110,7 +110,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) - error = security_path_truncate(&path, length, 0); + error = security_path_truncate(&path); if (!error) error = do_truncate(path.dentry, length, 0, NULL); @@ -165,8 +165,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) - error = security_path_truncate(&file->f_path, length, - ATTR_MTIME|ATTR_CTIME); + error = security_path_truncate(&file->f_path); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); out_putf: @@ -367,7 +366,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -396,7 +395,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &file->f_path); out_putf: @@ -414,7 +413,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 5dcd4b0..72c5265 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -459,7 +459,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ - INIT_RCU_HEAD(&p->rcu_head); rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 3e73de5..fc84976 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state) } *label; unsigned char *data; Sector sect; + sector_t labelsect; res = 0; blocksize = bdev_logical_block_size(bdev); @@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state) goto out_freeall; /* + * Special case for FBA disks: label sector does not depend on + * blocksize. + */ + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + labelsect = info->label_block; + else + labelsect = info->label_block * (blocksize >> 9); + + /* * Get volume label, extract name and type. */ - data = read_part_sector(state, info->label_block*(blocksize/512), - §); + data = read_part_sector(state, labelsect, §); if (data == NULL) goto out_readerr; diff --git a/fs/proc/array.c b/fs/proc/array.c index 9b58d38..fff6572 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } - cred = get_cred((struct cred *) __task_cred(p)); + cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 12c233d..ef72b16 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...) +{ + va_list args; + + if (printk_ratelimit()) { + va_start(args, fmt); + printk(KERN_ERR "Quota error (device %s): %s: ", + sb->s_id, func); + vprintk(fmt, args); + printk("\n"); + va_end(args); + } +} +EXPORT_SYMBOL(__quota_error); + #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; #endif @@ -676,7 +692,7 @@ static void prune_dqcache(int count) * This is called from kswapd when we think we need some * more memory */ -static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); @@ -705,11 +721,8 @@ void dqput(struct dquot *dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { - printk("VFS: dqput: trying to free free dquot\n"); - printk("VFS: device %s, dquot of %s %d\n", - dquot->dq_sb->s_id, - quotatypes[dquot->dq_type], - dquot->dq_id); + quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", + quotatypes[dquot->dq_type], dquot->dq_id); BUG(); } #endif @@ -732,9 +745,9 @@ we_slept: /* Commit dquot before releasing */ ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { - printk(KERN_ERR "VFS: cannot write quota structure on " - "device %s (error %d). Quota may get out of " - "sync!\n", dquot->dq_sb->s_id, ret); + quota_error(dquot->dq_sb, "Can't write quota structure" + " (error %d). Quota may get out of sync!", + ret); /* * We clear dirty bit anyway, so that we avoid * infinite loop here @@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type) #ifdef CONFIG_QUOTA_DEBUG if (reserved) { - printk(KERN_WARNING "VFS (%s): Writes happened before quota" - " was turned on thus quota information is probably " - "inconsistent. Please run quotacheck(8).\n", sb->s_id); + quota_error(sb, "Writes happened before quota was turned on " + "thus quota information is probably inconsistent. " + "Please run quotacheck(8)"); } #endif } @@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type, if (dqput_blocks(dquot)) { #ifdef CONFIG_QUOTA_DEBUG if (atomic_read(&dquot->dq_count) != 1) - printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); + quota_error(inode->i_sb, "Adding dquot with " + "dq_count %d to dispose list", + atomic_read(&dquot->dq_count)); #endif spin_lock(&dq_list_lock); /* As dquot must have currently users it can't be on @@ -986,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) { struct inode *inode; + int reserved = 0; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { @@ -995,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type, * only quota pointers and these have separate locking * (dqptr_sem). */ - if (!IS_NOQUOTA(inode)) + if (!IS_NOQUOTA(inode)) { + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; remove_inode_dquot_ref(inode, type, tofree_head); + } } spin_unlock(&inode_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened after quota" + " was disabled thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } +#endif } /* Gather all references from inodes and drop them */ @@ -1304,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_NOWARN; } +static int dquot_active(const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (IS_NOQUOTA(inode)) + return 0; + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); +} + /* * Initialize quota pointers in inode * @@ -1323,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return; /* First get references to structures we might need. */ @@ -1507,7 +1542,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); goto out; } @@ -1559,7 +1594,7 @@ int dquot_alloc_inode(const struct inode *inode) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1596,7 +1631,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_claim_rsv_space(inode, number); return 0; } @@ -1629,7 +1664,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_decr_space(inode, number, reserve); return; } @@ -1667,7 +1702,7 @@ void dquot_free_inode(const struct inode *inode) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1790,7 +1825,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) struct super_block *sb = inode->i_sb; int ret; - if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return 0; if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) @@ -1957,7 +1992,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) truncate_inode_pages(&toputinode[cnt]->i_data, 0); mutex_unlock(&toputinode[cnt]->i_mutex); - mark_inode_dirty(toputinode[cnt]); + mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); } @@ -2270,7 +2305,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) memset(di, 0, sizeof(*di)); di->d_version = FS_DQUOT_VERSION; di->d_flags = dquot->dq_type == USRQUOTA ? - XFS_USER_QUOTA : XFS_GROUP_QUOTA; + FS_USER_QUOTA : FS_GROUP_QUOTA; di->d_id = dquot->dq_id; spin_lock(&dq_data_lock); diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 24f0340..9e48874 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { - q_warn(KERN_WARNING "VFS: dquota write failed on " - "dev %s\n", sb->s_id); + quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -EIO; } @@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) - q_warn(KERN_ERR - "VFS: Can't write block (%u) with free entries.\n", - blk); + quota_error(info->dqi_sb, "Can't write block (%u) " + "with free entries", blk); return 0; out_buf: kfree(tmpbuf); @@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { - q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't " - "remove block (%u) from entry free list.\n", - blk); + quota_error(dquot->dq_sb, "Can't remove block (%u) " + "from entry free list", blk); goto out_buf; } } @@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, } #ifdef __QUOTA_QT_PARANOIA if (i == qtree_dqstr_in_blk(info)) { - printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " - "but it shouldn't.\n"); + quota_error(dquot->dq_sb, "Data block full but it shouldn't"); *err = -EIO; goto out_buf; } #endif *err = write_blk(info, blk, buf); if (*err < 0) { - q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " - "data block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't write quota data block %u", + blk); goto out_buf; } dquot->dq_off = (blk << info->dqi_blocksize_bits) + @@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = read_blk(info, *treeblk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read tree quota block " - "%u.\n", *treeblk); + quota_error(dquot->dq_sb, "Can't read tree quota " + "block %u", *treeblk); goto out_buf; } } @@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { - printk(KERN_ERR "VFS: Inserting already present quota " - "entry (block %u).\n", - le32_to_cpu(ref[get_index(info, + quota_error(dquot->dq_sb, "Inserting already present " + "quota entry (block %u)", + le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)])); ret = -EIO; goto out_buf; @@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { - q_warn(KERN_ERR "VFS: Error %zd occurred while " - "creating quota.\n", ret); + quota_error(sb, "Error %zd occurred while creating " + "quota", ret); kfree(ddquot); return ret; } @@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { - q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n", - sb->s_id); + quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -ENOSPC; } else { @@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { - q_warn(KERN_ERR "VFS: Quota structure has offset to other " - "block (%u) than it should (%u).\n", blk, - (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + quota_error(dquot->dq_sb, "Quota structure has offset to " + "other block (%u) than it should (%u)", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + quota_error(dquot->dq_sb, "Can't read quota data block %u", + blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; @@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't move quota data block (%u) " - "to free list.\n", blk); + quota_error(dquot->dq_sb, "Can't move quota data block " + "(%u) to free list", blk); goto out_buf; } } else { @@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't insert quota data " - "block (%u) to free entry list.\n", blk); + quota_error(dquot->dq_sb, "Can't insert quota " + "data block (%u) to free entry list", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't write quota data " - "block %u\n", blk); + quota_error(dquot->dq_sb, "Can't write quota " + "data block %u", blk); goto out_buf; } } @@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + quota_error(dquot->dq_sb, "Can't read quota data " + "block %u", blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - q_warn(KERN_ERR "VFS: Can't write quota tree " - "block %u.\n", *blk); + quota_error(dquot->dq_sb, "Can't write quota " + "tree block %u", blk); } } out_buf: @@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't read quota tree " + "block %u", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); @@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { - q_warn(KERN_ERR "VFS: Quota for id %u referenced " - "but not present.\n", dquot->dq_id); + quota_error(dquot->dq_sb, "Quota for id %u referenced " + "but not present", dquot->dq_id); ret = -EIO; goto out_buf; } else { @@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't read quota tree block %u", + blk); goto out_buf; } ret = 0; @@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) #ifdef __QUOTA_QT_PARANOIA /* Invalidated quota? */ if (!sb_dqopt(dquot->dq_sb)->files[type]) { - printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + quota_error(sb, "Quota invalidated while reading!"); return -EIO; } #endif @@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? */ if (offset < 0) - q_warn(KERN_ERR "VFS: Can't read quota " - "structure for id %u.\n", dquot->dq_id); + quota_error(sb, "Can't read quota structure " + "for id %u", dquot->dq_id); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); @@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (ret != info->dqi_entry_size) { if (ret >= 0) ret = -EIO; - q_warn(KERN_ERR "VFS: Error while reading quota " - "structure for id %u.\n", dquot->dq_id); + quota_error(sb, "Error while reading quota structure for id %u", + dquot->dq_id); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); kfree(ddquot); diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h index ccc3e71..a1ab8db 100644 --- a/fs/quota/quota_tree.h +++ b/fs/quota/quota_tree.h @@ -22,10 +22,4 @@ struct qt_disk_dqdbheader { #define QT_TREEOFF 1 /* Offset of tree in file in blocks */ -#define q_warn(fmt, args...) \ -do { \ - if (printk_ratelimit()) \ - printk(fmt, ## args); \ -} while(0) - #endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 4af344c..34b37a6 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot) (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); if (ret != sizeof(struct v1_disk_dqblk)) { - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", - dquot->dq_sb->s_id); + quota_error(dquot->dq_sb, "dquota write failed"); if (ret >= 0) ret = -EIO; goto out; diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 135206a..65444d2 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type, size = sb->s_op->quota_read(sb, type, (char *)dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - q_warn(KERN_WARNING "quota_v2: Failed header read:" - " expected=%zd got=%zd\n", - sizeof(struct v2_disk_dqheader), size); + quota_error(sb, "Failed header read: expected=%zd got=%zd", + sizeof(struct v2_disk_dqheader), size); return 0; } return 1; @@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", - sb->s_id); + quota_error(sb, "Can't read info structure"); return -1; } info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); @@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type) size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - q_warn(KERN_WARNING "Can't write info structure on device %s.\n", - sb->s_id); + quota_error(sb, "Can't write info structure"); return -1; } return 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0f22fda..29db72203 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1221,7 +1221,7 @@ static void init_inode(struct inode *inode, struct treepath *path) inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, SD_V2_SIZE)); - /* read persistent inode attributes from sd and initalise + /* read persistent inode attributes from sd and initialise generic inode flags from them */ REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); diff --git a/fs/splice.c b/fs/splice.c index 740e6b9..efdbfec 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1282,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); + return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + sd->flags); } /** @@ -1371,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) return -ESPIPE; if (off_out) { - if (!out->f_op || !out->f_op->llseek || - out->f_op->llseek == no_llseek) + if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; @@ -1392,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) return -ESPIPE; if (off_in) { - if (!in->f_op || !in->f_op->llseek || - in->f_op->llseek == no_llseek) + if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1beaa73..1b27b56 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); * @mode: file permissions. * */ -int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) +int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, + mode_t mode) { struct sysfs_dirent *sd; struct iattr newattrs; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index f71246b..a7ac78f 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -28,6 +28,7 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; + enum kobj_ns_type ns_type; int error; BUG_ON(!name); @@ -58,16 +59,29 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, if (!sd) goto out_put; - if (sysfs_ns_type(parent_sd)) + ns_type = sysfs_ns_type(parent_sd); + if (ns_type) sd->s_ns = target->ktype->namespace(target); sd->s_symlink.target_sd = target_sd; target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - if (warn) - error = sysfs_add_one(&acxt, sd); - else - error = __sysfs_add_one(&acxt, sd); + /* Symlinks must be between directories with the same ns_type */ + if (!ns_type || + (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); + } else { + error = -EINVAL; + WARN(1, KERN_WARNING + "sysfs: symlink across ns_types %s/%s -> %s/%s\n", + parent_sd->s_name, + sd->s_name, + sd->s_symlink.target_sd->s_parent->s_name, + sd->s_symlink.target_sd->s_name); + } sysfs_addrm_finish(&acxt); if (error) @@ -122,7 +136,7 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, { const void *ns = NULL; spin_lock(&sysfs_assoc_lock); - if (targ->sd) + if (targ->sd && sysfs_ns_type(kobj->sd)) ns = targ->sd->s_ns; spin_unlock(&sysfs_assoc_lock); sysfs_hash_and_remove(kobj->sd, ns, name); diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index ad7f67b..0084a33 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1457,13 +1457,13 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, pnode->lprops[iip].free, pnode->lprops[iip].dirty, @@ -1586,7 +1586,7 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) nnode = c->nroot; nnode = dirty_cow_nnode(c, nnode); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); i = lnum - c->main_first; shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; for (h = 1; h < c->lpt_hght; h++) { @@ -1594,19 +1594,19 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); nnode = dirty_cow_nnode(c, nnode); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); shft -= UBIFS_LPT_FANOUT_SHIFT; pnode = ubifs_get_pnode(c, nnode, iip); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); pnode = dirty_cow_pnode(c, pnode); if (IS_ERR(pnode)) - return ERR_PTR(PTR_ERR(pnode)); + return ERR_CAST(pnode); iip = (i & (UBIFS_LPT_FANOUT - 1)); dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum, pnode->lprops[iip].free, pnode->lprops[iip].dirty, diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 13cb7a4..d12535b 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -646,7 +646,7 @@ static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) shft -= UBIFS_LPT_FANOUT_SHIFT; nnode = ubifs_get_nnode(c, nnode, iip); if (IS_ERR(nnode)) - return ERR_PTR(PTR_ERR(nnode)); + return ERR_CAST(nnode); } iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); return ubifs_get_pnode(c, nnode, iip); diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 109c6ea..daae9e1 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -24,7 +24,7 @@ * This file implements functions needed to recover from unclean un-mounts. * When UBIFS is mounted, it checks a flag on the master node to determine if * an un-mount was completed successfully. If not, the process of mounting - * incorparates additional checking and fixing of on-flash data structures. + * incorporates additional checking and fixing of on-flash data structures. * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. @@ -1063,8 +1063,21 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) } err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); if (err) { - if (err == -ENOSPC) - dbg_err("could not find a dirty LEB"); + /* + * There are no dirty or empty LEBs subject to here being + * enough for the index. Try to use + * 'ubifs_find_free_leb_for_idx()', which will return any empty + * LEBs (ignoring index requirements). If the index then + * doesn't have enough LEBs the recovery commit will fail - + * which is the same result anyway i.e. recovery fails. So + * there is no problem ignoring index requirements and just + * grabbing a free LEB since we have already established there + * is not a dirty LEB we could have used instead. + */ + if (err == -ENOSPC) { + dbg_rcvry("could not find a dirty LEB"); + goto find_free; + } return err; } ubifs_assert(!(lp.flags & LPROPS_INDEX)); @@ -1139,8 +1152,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) find_free: /* * There is no GC head LEB or the free space in the GC head LEB is too - * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so - * GC is not run. + * small, or there are not dirty LEBs. Allocate gc_lnum by calling + * 'ubifs_find_free_leb_for_idx()' so GC is not run. */ lnum = ubifs_find_free_leb_for_idx(c); if (lnum < 0) { diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 02feb59..0b20111 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,7 +277,7 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) { int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 4d2f215..5fc5a09 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1307,6 +1307,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_orphans; err = ubifs_rcvry_gc_commit(c); + if (err) + goto out_orphans; } else { err = take_gc_lnum(c); if (err) @@ -1318,7 +1320,7 @@ static int mount_ubifs(struct ubifs_info *c) */ err = ubifs_leb_unmap(c, c->gc_lnum); if (err) - return err; + goto out_orphans; } err = dbg_check_lprops(c); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 2eef553..0431087 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/udf/file.c b/fs/udf/file.c index 94e06d6..6e450e0 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -36,7 +36,6 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/aio.h> -#include <linux/smp_lock.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/super.c b/fs/udf/super.c index 612d1e2..12bb651 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1579,9 +1579,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, { struct anchorVolDescPtr *anchor; long main_s, main_e, reserve_s, reserve_e; - struct udf_sb_info *sbi; - sbi = UDF_SB(sb); anchor = (struct anchorVolDescPtr *)bh->b_data; /* Locate the main sequence */ diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index c8fb13f..0dce969 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -87,11 +87,9 @@ xfs-y += xfs_alloc.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ - xfs_trans_item.o \ xfs_utils.o \ xfs_vnodeops.o \ - xfs_rw.o \ - xfs_dmops.o + xfs_rw.o xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 9f769b5..b277186 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -225,7 +225,7 @@ xfs_check_acl(struct inode *inode, int mask) struct posix_acl *acl; int error = -EAGAIN; - xfs_itrace_entry(ip); + trace_xfs_check_acl(ip); /* * If there is no attribute fork no ACL exists on this inode and diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 34640d6..d24e78f 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -21,19 +21,12 @@ #include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_trans.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_iomap.h" @@ -92,18 +85,15 @@ void xfs_count_page_state( struct page *page, int *delalloc, - int *unmapped, int *unwritten) { struct buffer_head *bh, *head; - *delalloc = *unmapped = *unwritten = 0; + *delalloc = *unwritten = 0; bh = head = page_buffers(page); do { - if (buffer_uptodate(bh) && !buffer_mapped(bh)) - (*unmapped) = 1; - else if (buffer_unwritten(bh)) + if (buffer_unwritten(bh)) (*unwritten) = 1; else if (buffer_delay(bh)) (*delalloc) = 1; @@ -212,23 +202,17 @@ xfs_setfilesize( } /* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. + * Schedule IO completion handling on the final put of an ioend. */ STATIC void xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) + struct xfs_ioend *ioend) { if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IO_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); + if (ioend->io_type == IO_UNWRITTEN) + queue_work(xfsconvertd_workqueue, &ioend->io_work); + else + queue_work(xfsdatad_workqueue, &ioend->io_work); } } @@ -272,11 +256,25 @@ xfs_end_io( */ if (error == EAGAIN) { atomic_inc(&ioend->io_remaining); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); /* ensure we don't spin on blocked ioends */ delay(1); - } else + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); + } +} + +/* + * Call IO completion handling in caller context on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend_sync( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) + xfs_end_io(&ioend->io_work); } /* @@ -309,6 +307,8 @@ xfs_alloc_ioend( atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -358,7 +358,7 @@ xfs_end_bio( bio->bi_end_io = NULL; bio_put(bio); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); } STATIC void @@ -500,7 +500,7 @@ xfs_submit_ioend( } if (bio) xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend, 0); + xfs_finish_ioend(ioend); } while ((ioend = next) != NULL); } @@ -614,31 +614,30 @@ xfs_map_at_offset( STATIC unsigned int xfs_probe_page( struct page *page, - unsigned int pg_offset, - int mapped) + unsigned int pg_offset) { + struct buffer_head *bh, *head; int ret = 0; if (PageWriteback(page)) return 0; + if (!PageDirty(page)) + return 0; + if (!page->mapping) + return 0; + if (!page_has_buffers(page)) + return 0; - if (page->mapping && PageDirty(page)) { - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (mapped != buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - } else - ret = mapped ? 0 : PAGE_CACHE_SIZE; - } + bh = head = page_buffers(page); + do { + if (!buffer_uptodate(bh)) + break; + if (!buffer_mapped(bh)) + break; + ret += bh->b_size; + if (ret >= pg_offset) + break; + } while ((bh = bh->b_this_page) != head); return ret; } @@ -648,8 +647,7 @@ xfs_probe_cluster( struct inode *inode, struct page *startpage, struct buffer_head *bh, - struct buffer_head *head, - int mapped) + struct buffer_head *head) { struct pagevec pvec; pgoff_t tindex, tlast, tloff; @@ -658,7 +656,7 @@ xfs_probe_cluster( /* First sum forwards in this page */ do { - if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) + if (!buffer_uptodate(bh) || !buffer_mapped(bh)) return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); @@ -692,7 +690,7 @@ xfs_probe_cluster( pg_offset = PAGE_CACHE_SIZE; if (page->index == tindex && trylock_page(page)) { - pg_len = xfs_probe_page(page, pg_offset, mapped); + pg_len = xfs_probe_page(page, pg_offset); unlock_page(page); } @@ -761,7 +759,6 @@ xfs_convert_page( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int startio, int all_bh) { struct buffer_head *bh, *head; @@ -832,19 +829,14 @@ xfs_convert_page( ASSERT(imap->br_startblock != DELAYSTARTBLOCK); xfs_map_at_offset(inode, bh, imap, offset); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); + page_dirty--; count++; } else { type = IO_NEW; - if (buffer_mapped(bh) && all_bh && startio) { + if (buffer_mapped(bh) && all_bh) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -859,14 +851,12 @@ xfs_convert_page( if (uptodate && bh == head) SetPageUptodate(page); - if (startio) { - if (count) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); + if (count) { + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) + done = 1; } + xfs_start_page_writeback(page, !page_dirty, count); return done; fail_unlock_page: @@ -886,7 +876,6 @@ xfs_cluster_write( struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, - int startio, int all_bh, pgoff_t tlast) { @@ -902,7 +891,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, all_bh); if (done) break; } @@ -981,7 +970,7 @@ xfs_aops_discard_page( */ error = xfs_bmapi(NULL, ip, offset_fsb, 1, XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL, NULL); + &nimaps, NULL); if (error) { /* something screwed, just bail */ @@ -1009,7 +998,7 @@ xfs_aops_discard_page( */ xfs_bmap_init(&flist, &firstblock); error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, - &flist, NULL, &done); + &flist, &done); ASSERT(!flist.xbf_count && !flist.xbf_first); if (error) { @@ -1032,50 +1021,66 @@ out_invalidate: } /* - * Calling this without startio set means we are being asked to make a dirty - * page ready for freeing it's buffers. When called with startio set then - * we are coming from writepage. + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. * - * When called with startio set it is important that we write the WHOLE - * page if possible. - * The bh->b_state's cannot know if any of the blocks or which block for - * that matter are dirty due to mmap writes, and therefore bh uptodate is - * only valid if the page itself isn't completely uptodate. Some layers - * may clear the page dirty flag prior to calling write page, under the - * assumption the entire page will be written out; by not writing out the - * whole page the page can be reused before all valid dirty data is - * written out. Note: in the case of a page that has been dirty'd by - * mapwrite and but partially setup by block_prepare_write the - * bh->b_states's will not agree and only ones setup by BPW/BCW will have - * valid state, thus the whole page must be written out thing. + * If we detect that a transaction would be required to flush the page, we + * have to check the process flags first, if we are already in a transaction + * or disk I/O during allocations is off, we need to fail the writepage and + * redirty the page. */ - STATIC int -xfs_page_state_convert( - struct inode *inode, - struct page *page, - struct writeback_control *wbc, - int startio, - int unmapped) /* also implies page uptodate */ +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + int delalloc, unwritten; struct buffer_head *bh, *head; struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; - unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; ssize_t size, len; int flags, err, imap_valid = 0, uptodate = 1; - int page_dirty, count = 0; - int trylock = 0; - int all_bh = unmapped; + int count = 0; + int all_bh = 0; - if (startio) { - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) - trylock |= BMAPI_TRYLOCK; - } + trace_xfs_writepage(inode, page, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) + goto out_fail; + + /* + * We need a transaction if there are delalloc or unwritten buffers + * on the page. + * + * If we need a transaction and the process flags say we are already + * in a transaction, or no IO is allowed then mark the page dirty + * again and leave the page as is. + */ + xfs_count_page_state(page, &delalloc, &unwritten); + if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) + goto out_fail; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -1084,50 +1089,33 @@ xfs_page_state_convert( if (page->index >= end_index) { if ((page->index >= end_index + 1) || !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { - if (startio) - unlock_page(page); + unlock_page(page); return 0; } } - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + offset); len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; type = IO_NEW; - /* TODO: cleanup count and page_dirty */ - do { if (offset >= end_offset) break; if (!buffer_uptodate(bh)) uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { - /* - * the iomap is actually still valid, but the ioend - * isn't. shouldn't happen too often. - */ + + /* + * A hole may still be marked uptodate because discard_buffer + * leaves the flag set. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } @@ -1135,19 +1123,7 @@ xfs_page_state_convert( if (imap_valid) imap_valid = xfs_imap_valid(inode, &imap, offset); - /* - * First case, map an unwritten extent and prepare for - * extent state conversion transaction on completion. - * - * Second case, allocate space for a delalloc buffer. - * We can return EAGAIN here in the release page case. - * - * Third case, an unmapped buffer was found, and we are - * in a path where we need to write the whole page out. - */ - if (buffer_unwritten(bh) || buffer_delay(bh) || - ((buffer_uptodate(bh) || PageUptodate(page)) && - !buffer_mapped(bh) && (unmapped || startio))) { + if (buffer_unwritten(bh) || buffer_delay(bh)) { int new_ioend = 0; /* @@ -1161,15 +1137,16 @@ xfs_page_state_convert( flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IO_DELAY; - flags = BMAPI_ALLOCATE | trylock; - } else { - type = IO_NEW; - flags = BMAPI_WRITE | BMAPI_MMAP; + flags = BMAPI_ALLOCATE; + + if (wbc->sync_mode == WB_SYNC_NONE && + wbc->nonblocking) + flags |= BMAPI_TRYLOCK; } if (!imap_valid) { /* - * if we didn't have a valid mapping then we + * If we didn't have a valid mapping then we * need to ensure that we put the new mapping * in a new ioend structure. This needs to be * done to ensure that the ioends correctly @@ -1177,14 +1154,7 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IO_NEW) { - size = xfs_probe_cluster(inode, - page, bh, head, 0); - } else { - size = len; - } - - err = xfs_map_blocks(inode, offset, size, + err = xfs_map_blocks(inode, offset, len, &imap, flags); if (err) goto error; @@ -1193,19 +1163,11 @@ xfs_page_state_convert( } if (imap_valid) { xfs_map_at_offset(inode, bh, &imap, offset); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, &ioend, - new_ioend); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; + xfs_add_to_ioend(inode, bh, offset, type, + &ioend, new_ioend); count++; } - } else if (buffer_uptodate(bh) && startio) { + } else if (buffer_uptodate(bh)) { /* * we got here because the buffer is already mapped. * That means it must already have extents allocated @@ -1213,8 +1175,7 @@ xfs_page_state_convert( */ if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, - head, 1); + size = xfs_probe_cluster(inode, page, bh, head); err = xfs_map_blocks(inode, offset, size, &imap, flags); if (err) @@ -1233,18 +1194,16 @@ xfs_page_state_convert( */ type = IO_NEW; if (trylock_buffer(bh)) { - ASSERT(buffer_mapped(bh)); if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, &ioend, !imap_valid); - page_dirty--; count++; } else { imap_valid = 0; } - } else if ((buffer_uptodate(bh) || PageUptodate(page)) && - (unmapped || startio)) { + } else if (PageUptodate(page)) { + ASSERT(buffer_mapped(bh)); imap_valid = 0; } @@ -1256,8 +1215,7 @@ xfs_page_state_convert( if (uptodate && bh == head) SetPageUptodate(page); - if (startio) - xfs_start_page_writeback(page, 1, count); + xfs_start_page_writeback(page, 1, count); if (ioend && imap_valid) { xfs_off_t end_index; @@ -1275,131 +1233,27 @@ xfs_page_state_convert( end_index = last_index; xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, startio, all_bh, end_index); + wbc, all_bh, end_index); } if (iohead) xfs_submit_ioend(wbc, iohead); - return page_dirty; + return 0; error: if (iohead) xfs_cancel_ioend(iohead); - /* - * If it's delalloc and we have nowhere to put it, - * throw it away, unless the lower layers told - * us to try again. - */ - if (err != -EAGAIN) { - if (!unmapped) - xfs_aops_discard_page(page); - ClearPageUptodate(page); - } + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); return err; -} - -/* - * writepage: Called from one of two places: - * - * 1. we are flushing a delalloc buffer head. - * - * 2. we are writing out a dirty page. Typically the page dirty - * state is cleared before we get here. In this case is it - * conceivable we have no buffer heads. - * - * For delalloc space on the page we need to allocate space and - * flush it. For unmapped buffer heads on the page we should - * allocate space if the page is uptodate. For any other dirty - * buffer heads on the page we should flush them. - * - * If we detect that a transaction would be required to flush - * the page, we have to check the process flags first, if we - * are already in a transaction or disk I/O during allocations - * is off, we need to fail the writepage and redirty the page. - */ - -STATIC int -xfs_vm_writepage( - struct page *page, - struct writeback_control *wbc) -{ - int error; - int need_trans; - int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; - - trace_xfs_writepage(inode, page, 0); - - /* - * Refuse to write the page out if we are called from reclaim context. - * - * This is primarily to avoid stack overflows when called from deep - * used stacks in random callers for direct reclaim, but disabling - * reclaim for kswap is a nice side-effect as kswapd causes rather - * suboptimal I/O patters, too. - * - * This should really be done by the core VM, but until that happens - * filesystems like XFS, btrfs and ext4 have to take care of this - * by themselves. - */ - if (current->flags & PF_MEMALLOC) - goto out_fail; - - /* - * We need a transaction if: - * 1. There are delalloc buffers on the page - * 2. The page is uptodate and we have unmapped buffers - * 3. The page is uptodate and we have no buffers - * 4. There are unwritten buffers on the page - */ - - if (!page_has_buffers(page)) { - unmapped = 1; - need_trans = 1; - } else { - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!PageUptodate(page)) - unmapped = 0; - need_trans = delalloc + unmapped + unwritten; - } - - /* - * If we need a transaction and the process flags say - * we are already in a transaction, or no IO is allowed - * then mark the page dirty again and leave the page - * as is. - */ - if (current_test_flags(PF_FSTRANS) && need_trans) - goto out_fail; - - /* - * Delay hooking up buffer heads until we have - * made our go/no-go decision. - */ - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); - - /* - * Convert delayed allocate, unwritten or unmapped space - * to real space and flush out to disk. - */ - error = xfs_page_state_convert(inode, page, wbc, 1, unmapped); - if (error == -EAGAIN) - goto out_fail; - if (unlikely(error < 0)) - goto out_unlock; - - return 0; out_fail: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; -out_unlock: - unlock_page(page); - return error; } STATIC int @@ -1413,65 +1267,27 @@ xfs_vm_writepages( /* * Called to move a page into cleanable state - and from there - * to be released. Possibly the page is already clean. We always + * to be released. The page should already be clean. We always * have buffer heads in this call. * - * Returns 0 if the page is ok to release, 1 otherwise. - * - * Possible scenarios are: - * - * 1. We are being called to release a page which has been written - * to via regular I/O. buffer heads will be dirty and possibly - * delalloc. If no delalloc buffer heads in this case then we - * can just return zero. - * - * 2. We are called to release a page which has been written via - * mmap, all we need to do is ensure there is no delalloc - * state in the buffer heads, if not we can let the caller - * free them and we should come back later via writepage. + * Returns 1 if the page is ok to release, 0 otherwise. */ STATIC int xfs_vm_releasepage( struct page *page, gfp_t gfp_mask) { - struct inode *inode = page->mapping->host; - int dirty, delalloc, unmapped, unwritten; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; + int delalloc, unwritten; - trace_xfs_releasepage(inode, page, 0); - - if (!page_has_buffers(page)) - return 0; + trace_xfs_releasepage(page->mapping->host, page, 0); - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!delalloc && !unwritten) - goto free_buffers; + xfs_count_page_state(page, &delalloc, &unwritten); - if (!(gfp_mask & __GFP_FS)) + if (WARN_ON(delalloc)) return 0; - - /* If we are already inside a transaction or the thread cannot - * do I/O, we cannot release this page. - */ - if (current_test_flags(PF_FSTRANS)) + if (WARN_ON(unwritten)) return 0; - /* - * Convert delalloc space to real space, do not flush the - * data out to disk, that will be done by the caller. - * Never need to allocate space here - we will always - * come back to writepage in that case. - */ - dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0); - if (dirty == 0 && !unwritten) - goto free_buffers; - return 0; - -free_buffers: return try_to_free_buffers(page); } @@ -1481,9 +1297,9 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct, - bmapi_flags_t flags) + int direct) { + int flags = create ? BMAPI_WRITE : BMAPI_READ; struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; @@ -1498,8 +1314,11 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - error = xfs_iomap(XFS_I(inode), offset, size, - create ? flags : BMAPI_READ, &imap, &nimap, &new); + if (direct && create) + flags |= BMAPI_DIRECT; + + error = xfs_iomap(XFS_I(inode), offset, size, flags, &imap, &nimap, + &new); if (error) return -error; if (nimap == 0) @@ -1579,8 +1398,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, - bh_result, create, 0, BMAPI_WRITE); + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); } STATIC int @@ -1590,61 +1408,59 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, - bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT); + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); } +/* + * Complete a direct I/O write request. + * + * If the private argument is non-NULL __xfs_get_blocks signals us that we + * need to issue a transaction to convert the range from unwritten to written + * extents. In case this is regular synchronous I/O we just call xfs_end_io + * to do this and we are done. But in case this was a successfull AIO + * request this handler is called from interrupt context, from which we + * can't start transactions. In that case offload the I/O completion to + * the workqueues we also use for buffered I/O completion. + */ STATIC void -xfs_end_io_direct( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private, + int ret, + bool is_async) { - xfs_ioend_t *ioend = iocb->private; + struct xfs_ioend *ioend = iocb->private; /* - * Non-NULL private data means we need to issue a transaction to - * convert a range from unwritten to written extents. This needs - * to happen from process context but aio+dio I/O completion - * happens from irq context so we need to defer it to a workqueue. - * This is not necessary for synchronous direct I/O, but we do - * it anyway to keep the code uniform and simpler. - * - * Well, if only it were that simple. Because synchronous direct I/O - * requires extent conversion to occur *before* we return to userspace, - * we have to wait for extent conversion to complete. Look at the - * iocb that has been passed to us to determine if this is AIO or - * not. If it is synchronous, tell xfs_finish_ioend() to kick the - * workqueue and wait for it to complete. - * - * The core direct I/O code might be changed to always call the - * completion handler in the future, in which case all this can - * go away. + * blockdev_direct_IO can return an error even after the I/O + * completion handler was called. Thus we need to protect + * against double-freeing. */ + iocb->private = NULL; + ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IO_READ) { - xfs_finish_ioend(ioend, 0); - } else if (private && size > 0) { - xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); - } else { + if (private && size > 0) + ioend->io_type = IO_UNWRITTEN; + + if (is_async) { /* - * A direct I/O write ioend starts it's life in unwritten - * state in case they map an unwritten extent. This write - * didn't map an unwritten extent so switch it's completion - * handler. + * If we are converting an unwritten extent we need to delay + * the AIO completion until after the unwrittent extent + * conversion has completed, otherwise do it ASAP. */ - ioend->io_type = IO_NEW; - xfs_finish_ioend(ioend, 0); + if (ioend->io_type == IO_UNWRITTEN) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + } else { + aio_complete(iocb, ret, 0); + } + xfs_finish_ioend(ioend); + } else { + xfs_finish_ioend_sync(ioend); } - - /* - * blockdev_direct_IO can return an error even after the I/O - * completion handler was called. Thus we need to protect - * against double-freeing. - */ - iocb->private = NULL; } STATIC ssize_t @@ -1655,23 +1471,26 @@ xfs_vm_direct_IO( loff_t offset, unsigned long nr_segs) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct block_device *bdev; - ssize_t ret; - - bdev = xfs_find_bdev_for_inode(inode); - - iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IO_UNWRITTEN : IO_READ); - - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + ssize_t ret; + + if (rw & WRITE) { + iocb->private = xfs_alloc_ioend(inode, IO_NEW); + + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct_write); + if (ret != -EIOCBQUEUED && iocb->private) + xfs_destroy_ioend(iocb->private); + } else { + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, + offset, nr_segs, + xfs_get_blocks_direct, + NULL); + } - if (unlikely(ret != -EIOCBQUEUED && iocb->private)) - xfs_destroy_ioend(iocb->private); return ret; } @@ -1686,8 +1505,8 @@ xfs_vm_write_begin( void **fsdata) { *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - xfs_get_blocks); + return block_write_begin(file, mapping, pos, len, flags | AOP_FLAG_NOFS, + pagep, fsdata, xfs_get_blocks); } STATIC sector_t @@ -1698,7 +1517,7 @@ xfs_vm_bmap( struct inode *inode = (struct inode *)mapping->host; struct xfs_inode *ip = XFS_I(inode); - xfs_itrace_entry(XFS_I(inode)); + trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); xfs_iunlock(ip, XFS_IOLOCK_SHARED); diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4cfc6ea..c5057fb 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -37,6 +37,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; @@ -45,6 +47,6 @@ extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); extern void xfs_ioend_init(void); extern void xfs_ioend_wait(struct xfs_inode *); -extern void xfs_count_page_state(struct page *, int *, int *, int *); +extern void xfs_count_page_state(struct page *, int *, int *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 649ade8..ea79072 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -39,13 +39,12 @@ #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -340,7 +339,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -579,9 +578,9 @@ _xfs_buf_read( XBF_READ_AHEAD | _XBF_RUN_QUEUES); status = xfs_buf_iorequest(bp); - if (!status && !(flags & XBF_ASYNC)) - status = xfs_buf_iowait(bp); - return status; + if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + return status; + return xfs_buf_iowait(bp); } xfs_buf_t * @@ -897,36 +896,6 @@ xfs_buf_unlock( trace_xfs_buf_unlock(bp, _RET_IP_); } - -/* - * Pinning Buffer Storage in Memory - * Ensure that no attempt to force a buffer to disk will succeed. - */ -void -xfs_buf_pin( - xfs_buf_t *bp) -{ - trace_xfs_buf_pin(bp, _RET_IP_); - atomic_inc(&bp->b_pin_count); -} - -void -xfs_buf_unpin( - xfs_buf_t *bp) -{ - trace_xfs_buf_unpin(bp, _RET_IP_); - - if (atomic_dec_and_test(&bp->b_pin_count)) - wake_up_all(&bp->b_waiters); -} - -int -xfs_buf_ispin( - xfs_buf_t *bp) -{ - return atomic_read(&bp->b_pin_count); -} - STATIC void xfs_buf_wait_unpin( xfs_buf_t *bp) @@ -1018,13 +987,12 @@ xfs_bwrite( { int error; - bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ); xfs_buf_delwri_dequeue(bp); - xfs_buf_iostrategy(bp); + xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); if (error) @@ -1040,7 +1008,6 @@ xfs_bdwrite( { trace_xfs_buf_bdwrite(bp, _RET_IP_); - bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags &= ~XBF_READ; @@ -1075,7 +1042,6 @@ xfs_bioerror( XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); xfs_biodone(bp); return EIO; @@ -1105,7 +1071,6 @@ xfs_bioerror_relse( XFS_BUF_DONE(bp); XFS_BUF_STALE(bp); XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); if (!(fl & XBF_ASYNC)) { /* * Mark b_error and B_ERROR _both_. @@ -1311,8 +1276,19 @@ submit_io: if (size) goto next_chunk; } else { - bio_put(bio); + /* + * if we get here, no pages were added to the bio. However, + * we can't just error out here - if the pages are locked then + * we have to unlock them otherwise we can hang on a later + * access to the page. + */ xfs_buf_ioerror(bp, EIO); + if (bp->b_flags & _XBF_PAGE_LOCKED) { + int i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); + } + bio_put(bio); } } @@ -1762,6 +1738,7 @@ xfs_buf_runall_queues( STATIC int xfsbufd_wakeup( + struct shrinker *shrink, int priority, gfp_t mask) { @@ -1803,7 +1780,7 @@ xfs_buf_delwri_split( trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); - if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { + if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { if (!force && time_before(jiffies, bp->b_queuetime + age)) { xfs_buf_unlock(bp); @@ -1888,7 +1865,7 @@ xfsbufd( struct xfs_buf *bp; bp = list_first_entry(&tmp, struct xfs_buf, b_list); list_del_init(&bp->b_list); - xfs_buf_iostrategy(bp); + xfs_bdstrat_cb(bp); count++; } if (count) @@ -1935,7 +1912,7 @@ xfs_flush_buftarg( bp->b_flags &= ~XBF_ASYNC; list_add(&bp->b_list, &wait_list); } - xfs_buf_iostrategy(bp); + xfs_bdstrat_cb(bp); } if (wait) { diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 5fbecef..d072e5f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -44,57 +44,57 @@ typedef enum { XBRW_ZERO = 3, /* Zero target memory */ } xfs_buf_rw_t; -typedef enum { - XBF_READ = (1 << 0), /* buffer intended for reading from device */ - XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ - XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ - XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ - XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ - XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ - XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ - XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ - XBF_ORDERED = (1 << 11), /* use ordered writes */ - XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ - XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log */ - - /* flags used only as arguments to access routines */ - XBF_LOCK = (1 << 14), /* lock requested */ - XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ - XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ - - /* flags used only internally */ - _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ - _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ - _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ - - /* - * Special flag for supporting metadata blocks smaller than a FSB. - * - * In this case we can have multiple xfs_buf_t on a single page and - * need to lock out concurrent xfs_buf_t readers as they only - * serialise access to the buffer. - * - * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation - * between reads of the page. Hence we can have one thread read the - * page and modify it, but then race with another thread that thinks - * the page is not up-to-date and hence reads it again. - * - * The result is that the first modifcation to the page is lost. - * This sort of AGF/AGI reading race can happen when unlinking inodes - * that require truncation and results in the AGI unlinked list - * modifications being lost. - */ - _XBF_PAGE_LOCKED = (1 << 22), - - /* - * If we try a barrier write, but it fails we have to communicate - * this to the upper layers. Unfortunately b_error gets overwritten - * when the buffer is re-issued so we have to add another flag to - * keep this information. - */ - _XFS_BARRIER_FAILED = (1 << 23), -} xfs_buf_flags_t; +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ +#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */ +#define XBF_ORDERED (1 << 11)/* use ordered writes */ +#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ +#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ + +/* flags used only as arguments to access routines */ +#define XBF_LOCK (1 << 14)/* lock requested */ +#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */ +#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ + +/* flags used only internally */ +#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */ +#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ +#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ +#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ + +/* + * Special flag for supporting metadata blocks smaller than a FSB. + * + * In this case we can have multiple xfs_buf_t on a single page and + * need to lock out concurrent xfs_buf_t readers as they only + * serialise access to the buffer. + * + * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation + * between reads of the page. Hence we can have one thread read the + * page and modify it, but then race with another thread that thinks + * the page is not up-to-date and hence reads it again. + * + * The result is that the first modifcation to the page is lost. + * This sort of AGF/AGI reading race can happen when unlinking inodes + * that require truncation and results in the AGI unlinked list + * modifications being lost. + */ +#define _XBF_PAGE_LOCKED (1 << 22) + +/* + * If we try a barrier write, but it fails we have to communicate + * this to the upper layers. Unfortunately b_error gets overwritten + * when the buffer is re-issued so we have to add another flag to + * keep this information. + */ +#define _XFS_BARRIER_FAILED (1 << 23) + +typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ { XBF_READ, "READ" }, \ @@ -187,7 +187,6 @@ typedef struct xfs_buf { atomic_t b_io_remaining; /* #outstanding I/O requests */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ - xfs_buf_bdstrat_t b_strat; /* pre-write function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; @@ -245,11 +244,6 @@ extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); -static inline int xfs_buf_iostrategy(xfs_buf_t *bp) -{ - return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); -} - static inline int xfs_buf_geterror(xfs_buf_t *bp) { return bp ? bp->b_error : ENOMEM; @@ -258,11 +252,6 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); -/* Pinning Buffer Storage in Memory */ -extern void xfs_buf_pin(xfs_buf_t *); -extern void xfs_buf_unpin(xfs_buf_t *); -extern int xfs_buf_ispin(xfs_buf_t *); - /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_dequeue(xfs_buf_t *); extern void xfs_buf_delwri_promote(xfs_buf_t *); @@ -326,8 +315,6 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) -#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) -#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) #define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) @@ -351,7 +338,7 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) #define XFS_BUF_SET_REF(bp, ref) do { } while (0) -#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) +#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) #define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) @@ -370,8 +357,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } -#define xfs_bpin(bp) xfs_buf_pin(bp) -#define xfs_bunpin(bp) xfs_buf_unpin(bp) #define xfs_biodone(bp) xfs_buf_ioend(bp, 0) #define xfs_biomove(bp, off, len, data, rw) \ diff --git a/fs/xfs/linux-2.6/xfs_dmapi_priv.h b/fs/xfs/linux-2.6/xfs_dmapi_priv.h deleted file mode 100644 index a8b0b16..0000000 --- a/fs/xfs/linux-2.6/xfs_dmapi_priv.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DMAPI_PRIV_H__ -#define __XFS_DMAPI_PRIV_H__ - -/* - * Based on IO_ISDIRECT, decide which i_ flag is set. - */ -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IMUX : 0) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) - -#endif /*__XFS_DMAPI_PRIV_H__*/ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index e7839ee..3764d74 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -23,13 +23,13 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_export.h" #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_inode_item.h" +#include "xfs_trace.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -132,8 +132,7 @@ xfs_nfs_get_inode( * fine and not an indication of a corrupted filesystem as clients can * send invalid file handles and we have to handle it gracefully.. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, - XFS_ILOCK_SHARED, &ip); + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); if (error) { /* * EINVAL means the inode cluster doesn't exist anymore. @@ -148,11 +147,10 @@ xfs_nfs_get_inode( } if (ip->i_d.di_gen != generation) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); + IRELE(ip); return ERR_PTR(-ENOENT); } - xfs_iunlock(ip, XFS_ILOCK_SHARED); return VFS_I(ip); } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 257a56b..ba8ad42 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -22,23 +22,15 @@ #include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_trans.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" #include "xfs_ioctl.h" @@ -108,7 +100,7 @@ xfs_file_fsync( int error = 0; int log_flushed = 0; - xfs_itrace_entry(ip); + trace_xfs_file_fsync(ip); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -XFS_ERROR(EIO); @@ -166,8 +158,7 @@ xfs_file_fsync( * transaction. So we play it safe and fire off the * transaction anyway. */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = _xfs_trans_commit(tp, 0, &log_flushed); @@ -275,20 +266,6 @@ xfs_file_aio_read( mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); - int iolock = XFS_IOLOCK_SHARED; - - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size, - dmflags, &iolock); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - return ret; - } - } - if (unlikely(ioflags & IO_ISDIRECT)) { if (inode->i_mapping->nrpages) { ret = -xfs_flushinval_pages(ip, @@ -321,7 +298,6 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; int ioflags = 0; ssize_t ret; @@ -335,18 +311,6 @@ xfs_file_splice_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_SHARED; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, - FILP_DELAY_FLAG(infilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); @@ -367,7 +331,6 @@ xfs_file_splice_write( { struct inode *inode = outfilp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; xfs_fsize_t isize, new_size; int ioflags = 0; ssize_t ret; @@ -382,18 +345,6 @@ xfs_file_splice_write( xfs_ilock(ip, XFS_IOLOCK_EXCL); - if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_EXCL; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, - FILP_DELAY_FLAG(outfilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; - } - } - new_size = *ppos + count; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -463,7 +414,7 @@ xfs_zero_last_block( last_fsb = XFS_B_TO_FSBT(mp, isize); nimaps = 1; error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL, NULL); + &nimaps, NULL); if (error) { return error; } @@ -558,7 +509,7 @@ xfs_zero_eof( nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL, NULL); + 0, NULL, 0, &imap, &nimaps, NULL); if (error) { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; @@ -627,7 +578,6 @@ xfs_file_aio_write( int ioflags = 0; xfs_fsize_t isize, new_size; int iolock; - int eventsent = 0; size_t ocount = 0, count; int need_i_mutex; @@ -673,33 +623,6 @@ start: goto out_unlock_mutex; } - if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && - !(ioflags & IO_INVIS) && !eventsent)) { - int dmflags = FILP_DELAY_FLAG(file); - - if (need_i_mutex) - dmflags |= DM_FLAGS_IMUX; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, - pos, count, dmflags, &iolock); - if (error) { - goto out_unlock_internal; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - eventsent = 1; - - /* - * The iolock was dropped and reacquired in XFS_SEND_DATA - * so we have to recheck the size when appending. - * We will only "goto start;" once, since having sent the - * event prevents another call to XFS_SEND_DATA, which is - * what allows the size to change in the first place. - */ - if ((file->f_flags & O_APPEND) && pos != ip->i_size) - goto start; - } - if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? @@ -830,22 +753,6 @@ write_retry: xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (ret == -ENOSPC && - DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_iunlock(ip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, - DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, - 0, 0, 0); /* Delay flag intentionally unused */ - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, iolock); - if (error) - goto out_unlock_internal; - goto start; - } - error = -ret; if (ret <= 0) goto out_unlock_internal; @@ -1014,9 +921,6 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, -#ifdef HAVE_FOP_OPEN_EXEC - .open_exec = xfs_file_open_exec, -#endif }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index b6918d7..1f279b0 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -21,10 +21,6 @@ #include "xfs_inode.h" #include "xfs_trace.h" -int fs_noerr(void) { return 0; } -int fs_nosys(void) { return ENOSYS; } -void fs_noval(void) { return; } - /* * note: all filemap functions return negative error codes. These * need to be inverted before returning to the xfs core functions. diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h deleted file mode 100644 index 82bb19b..0000000 --- a/fs/xfs/linux-2.6/xfs_fs_subr.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_FS_SUBR_H__ -#define __XFS_FS_SUBR_H__ - -extern int fs_noerr(void); -extern int fs_nosys(void); -extern void fs_noval(void); - -#endif /* __XFS_FS_SUBR_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index e59a810..237f5ff 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -23,24 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ioctl.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_rtalloc.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" @@ -908,7 +899,7 @@ xfs_ioctl_setattr( struct xfs_dquot *olddquot = NULL; int code; - xfs_itrace_entry(ip); + trace_xfs_ioctl_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); @@ -1043,8 +1034,7 @@ xfs_ioctl_setattr( } } - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Change file ownership. Must be the owner or privileged. @@ -1116,16 +1106,7 @@ xfs_ioctl_setattr( xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); - if (code) - return code; - - if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) { - XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, - (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0); - } - - return 0; + return code; error_return: xfs_qm_dqrele(udqp); @@ -1301,7 +1282,7 @@ xfs_file_ioctl( if (filp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - xfs_itrace_entry(ip); + trace_xfs_file_ioctl(ip); switch (cmd) { case XFS_IOC_ALLOCSP: diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 52ed49e..6c83f7f 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -28,12 +28,8 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" #include "xfs_vnode.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -544,7 +540,7 @@ xfs_file_compat_ioctl( if (filp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - xfs_itrace_entry(ip); + trace_xfs_file_compat_ioctl(ip); switch (cmd) { /* No size or alignment issues on any arch */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 44f0b2d..536b81e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -24,21 +24,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" @@ -496,7 +488,7 @@ xfs_vn_getattr( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - xfs_itrace_entry(ip); + trace_xfs_getattr(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index facfb32..998a9d7 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -87,7 +87,6 @@ #include <xfs_aops.h> #include <xfs_super.h> #include <xfs_globals.h> -#include <xfs_fs_subr.h> #include <xfs_buf.h> /* diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 067cafb..29b9d64 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -16,7 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" #include "xfs_log.h" @@ -69,15 +68,15 @@ xfs_fs_set_xstate( if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (uflags & XFS_QUOTA_UDQ_ACCT) + if (uflags & FS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; - if (uflags & XFS_QUOTA_PDQ_ACCT) + if (uflags & FS_QUOTA_PDQ_ACCT) flags |= XFS_PQUOTA_ACCT; - if (uflags & XFS_QUOTA_GDQ_ACCT) + if (uflags & FS_QUOTA_GDQ_ACCT) flags |= XFS_GQUOTA_ACCT; - if (uflags & XFS_QUOTA_UDQ_ENFD) + if (uflags & FS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) + if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) flags |= XFS_OQUOTA_ENFD; switch (op) { diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f2d1718..758df946 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -25,14 +25,11 @@ #include "xfs_ag.h" #include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -43,7 +40,6 @@ #include "xfs_error.h" #include "xfs_itable.h" #include "xfs_fsops.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -94,7 +90,6 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and * unwritten extent conversion */ #define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ -#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ #define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ @@ -116,9 +111,6 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ -#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ -#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ @@ -172,15 +164,13 @@ suffix_strtoul(char *s, char **endp, unsigned int base) STATIC int xfs_parseargs( struct xfs_mount *mp, - char *options, - char **mtpt) + char *options) { struct super_block *sb = mp->m_super; char *this_char, *value, *eov; int dsunit = 0; int dswidth = 0; int iosize = 0; - int dmapi_implies_ikeep = 1; __uint8_t iosizelog = 0; /* @@ -243,15 +233,10 @@ xfs_parseargs( if (!mp->m_logname) return ENOMEM; } else if (!strcmp(this_char, MNTOPT_MTPT)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL); - if (!*mtpt) - return ENOMEM; + cmn_err(CE_WARN, + "XFS: %s option not allowed on this system", + this_char); + return EINVAL; } else if (!strcmp(this_char, MNTOPT_RTDEV)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -288,8 +273,6 @@ xfs_parseargs( mp->m_flags &= ~XFS_MOUNT_GRPID; } else if (!strcmp(this_char, MNTOPT_WSYNC)) { mp->m_flags |= XFS_MOUNT_WSYNC; - } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { mp->m_flags |= XFS_MOUNT_NORECOVERY; } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { @@ -329,7 +312,6 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_IKEEP)) { mp->m_flags |= XFS_MOUNT_IKEEP; } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { - dmapi_implies_ikeep = 0; mp->m_flags &= ~XFS_MOUNT_IKEEP; } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; @@ -370,12 +352,6 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_OQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DMAPI)) { - mp->m_flags |= XFS_MOUNT_DMAPI; - } else if (!strcmp(this_char, MNTOPT_XDSM)) { - mp->m_flags |= XFS_MOUNT_DMAPI; - } else if (!strcmp(this_char, MNTOPT_DMI)) { - mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; cmn_err(CE_WARN, @@ -387,9 +363,11 @@ xfs_parseargs( cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); } else if (!strcmp(this_char, "osyncisdsync")) { - /* no-op, this is now the default */ cmn_err(CE_WARN, - "XFS: osyncisdsync is now the default, option is deprecated."); + "XFS: osyncisdsync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "osyncisosync")) { + cmn_err(CE_WARN, + "XFS: osyncisosync has no effect, option is deprecated."); } else if (!strcmp(this_char, "irixsgid")) { cmn_err(CE_WARN, "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); @@ -430,12 +408,6 @@ xfs_parseargs( return EINVAL; } - if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) { - printk("XFS: %s option needs the mount point option as well\n", - MNTOPT_DMAPI); - return EINVAL; - } - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { cmn_err(CE_WARN, "XFS: sunit and swidth must be specified together"); @@ -449,18 +421,6 @@ xfs_parseargs( return EINVAL; } - /* - * Applications using DMI filesystems often expect the - * inode generation number to be monotonically increasing. - * If we delete inode chunks we break this assumption, so - * keep unused inode chunks on disk for DMI filesystems - * until we come up with a better solution. - * Note that if "ikeep" or "noikeep" mount options are - * supplied, then they are honored. - */ - if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep) - mp->m_flags |= XFS_MOUNT_IKEEP; - done: if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { /* @@ -539,10 +499,8 @@ xfs_showargs( { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, - { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } @@ -947,7 +905,7 @@ xfs_fs_destroy_inode( { struct xfs_inode *ip = XFS_I(inode); - xfs_itrace_entry(ip); + trace_xfs_destroy_inode(ip); XFS_STATS_INC(vn_reclaim); @@ -1063,10 +1021,8 @@ xfs_log_inode( * an inode in another recent transaction. So we play it safe and * fire off the transaction anyway. */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); xfs_ilock_demote(ip, XFS_ILOCK_EXCL); @@ -1082,27 +1038,18 @@ xfs_fs_write_inode( struct xfs_mount *mp = ip->i_mount; int error = EAGAIN; - xfs_itrace_entry(ip); + trace_xfs_write_inode(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); if (wbc->sync_mode == WB_SYNC_ALL) { /* - * Make sure the inode has hit stable storage. By using the - * log and the fsync transactions we reduce the IOs we have - * to do here from two (log and inode) to just the log. - * - * Note: We still need to do a delwri write of the inode after - * this to flush it to the backing buffer so that bulkstat - * works properly if this is the first time the inode has been - * written. Because we hold the ilock atomically over the - * transaction commit and the inode flush we are guaranteed - * that the inode is not pinned when it returns. If the flush - * lock is already held, then the inode has already been - * flushed once and we don't need to flush it again. Hence - * the code will only flush the inode if it isn't already - * being flushed. + * Make sure the inode has made it it into the log. Instead + * of forcing it all the way to stable storage using a + * synchronous transaction we let the log force inside the + * ->sync_fs call do that for thus, which reduces the number + * of synchronous log foces dramatically. */ xfs_ioend_wait(ip); xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -1116,27 +1063,29 @@ xfs_fs_write_inode( * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. * This prevents the flush path from blocking on inodes inside - * another operation right now, they get caught later by xfs_sync. + * another operation right now, they get caught later by + * xfs_sync. */ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) goto out; - } - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) - goto out_unlock; + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; - /* - * Now we have the flush lock and the inode is not pinned, we can check - * if the inode is really clean as we know that there are no pending - * transaction completions, it is not waiting on the delayed write - * queue and there is no IO in progress. - */ - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - error = 0; - goto out_unlock; + /* + * Now we have the flush lock and the inode is not pinned, we + * can check if the inode is really clean as we know that + * there are no pending transaction completions, it is not + * waiting on the delayed write queue and there is no IO in + * progress. + */ + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + error = 0; + goto out_unlock; + } + error = xfs_iflush(ip, 0); } - error = xfs_iflush(ip, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -1156,7 +1105,8 @@ xfs_fs_clear_inode( { xfs_inode_t *ip = XFS_I(inode); - xfs_itrace_entry(ip); + trace_xfs_clear_inode(ip); + XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); @@ -1193,22 +1143,13 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); + /* + * Unregister the memory shrinker before we tear down the mount + * structure so we don't have memory reclaim racing with us here. + */ + xfs_inode_shrinker_unregister(mp); xfs_syncd_stop(mp); - if (!(sb->s_flags & MS_RDONLY)) { - /* - * XXX(hch): this should be SYNC_WAIT. - * - * Or more likely not needed at all because the VFS is already - * calling ->sync_fs after shutting down all filestem - * operations and just before calling ->put_super. - */ - xfs_sync_data(mp, 0); - xfs_sync_attr(mp, 0); - } - - XFS_SEND_PREUNMOUNT(mp); - /* * Blow away any referenced inode in the filestreams cache. * This can and will cause log traffic as inodes go inactive @@ -1218,14 +1159,10 @@ xfs_fs_put_super( XFS_bflush(mp->m_ddev_targp); - XFS_SEND_UNMOUNT(mp); - xfs_unmountfs(mp); xfs_freesb(mp); - xfs_inode_shrinker_unregister(mp); xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); - xfs_dmops_put(mp); xfs_free_fsname(mp); kfree(mp); } @@ -1543,7 +1480,6 @@ xfs_fs_fill_super( struct inode *root; struct xfs_mount *mp = NULL; int flags = 0, error = ENOMEM; - char *mtpt = NULL; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) @@ -1559,7 +1495,7 @@ xfs_fs_fill_super( mp->m_super = sb; sb->s_fs_info = mp; - error = xfs_parseargs(mp, (char *)data, &mtpt); + error = xfs_parseargs(mp, (char *)data); if (error) goto out_free_fsname; @@ -1571,16 +1507,12 @@ xfs_fs_fill_super( #endif sb->s_op = &xfs_super_operations; - error = xfs_dmops_get(mp); - if (error) - goto out_free_fsname; - if (silent) flags |= XFS_MFSI_QUIET; error = xfs_open_devices(mp); if (error) - goto out_put_dmops; + goto out_free_fsname; if (xfs_icsb_init_counters(mp)) mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; @@ -1608,8 +1540,6 @@ xfs_fs_fill_super( if (error) goto out_filestream_unmount; - XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); - sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; @@ -1638,7 +1568,6 @@ xfs_fs_fill_super( xfs_inode_shrinker_register(mp); - kfree(mtpt); return 0; out_filestream_unmount: @@ -1648,11 +1577,8 @@ xfs_fs_fill_super( out_destroy_counters: xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); - out_put_dmops: - xfs_dmops_put(mp); out_free_fsname: xfs_free_fsname(mp); - kfree(mtpt); kfree(mp); out: return -error; @@ -1759,6 +1685,12 @@ xfs_init_zones(void) if (!xfs_trans_zone) goto out_destroy_ifork_zone; + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + /* * The size of the zone allocated buf log item is the maximum * size possible under XFS. This wastes a little bit of memory, @@ -1768,7 +1700,7 @@ xfs_init_zones(void) (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) - goto out_destroy_trans_zone; + goto out_destroy_log_item_desc_zone; xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * @@ -1805,6 +1737,8 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_efd_zone); out_destroy_buf_item_zone: kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); out_destroy_trans_zone: kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: @@ -1835,6 +1769,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_efd_zone); kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_dabuf_zone); @@ -1883,7 +1818,6 @@ init_xfs_fs(void) goto out_cleanup_procfs; vfs_initquota(); - xfs_inode_shrinker_init(); error = register_filesystem(&xfs_fs_type); if (error) @@ -1911,7 +1845,6 @@ exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); - xfs_inode_shrinker_destroy(); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 519618e..1ef4a4d 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -56,12 +56,6 @@ extern void xfs_qm_exit(void); # define XFS_BIGFS_STRING #endif -#ifdef CONFIG_XFS_DMAPI -# define XFS_DMAPI_STRING "dmapi support, " -#else -# define XFS_DMAPI_STRING -#endif - #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -72,7 +66,6 @@ extern void xfs_qm_exit(void); XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ XFS_BIGFS_STRING \ - XFS_DMAPI_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index ef7f021..dfcbd98 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -24,25 +24,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_inode.h" #include "xfs_dinode.h" #include "xfs_error.h" -#include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_vnodeops.h" -#include "xfs_utils.h" -#include "xfs_buf_item.h" #include "xfs_inode_item.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trace.h" @@ -144,6 +133,41 @@ restart: return last_error; } +/* + * Select the next per-ag structure to iterate during the walk. The reclaim + * walk is optimised only to walk AGs with reclaimable inodes in them. + */ +static struct xfs_perag * +xfs_inode_ag_iter_next_pag( + struct xfs_mount *mp, + xfs_agnumber_t *first, + int tag) +{ + struct xfs_perag *pag = NULL; + + if (tag == XFS_ICI_RECLAIM_TAG) { + int found; + int ref; + + spin_lock(&mp->m_perag_lock); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, *first, 1, tag); + if (found <= 0) { + spin_unlock(&mp->m_perag_lock); + return NULL; + } + *first = pag->pag_agno + 1; + /* open coded pag reference increment */ + ref = atomic_inc_return(&pag->pag_ref); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); + } else { + pag = xfs_perag_get(mp, *first); + (*first)++; + } + return pag; +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -154,16 +178,15 @@ xfs_inode_ag_iterator( int exclusive, int *nr_to_scan) { + struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; int nr; nr = nr_to_scan ? *nr_to_scan : INT_MAX; - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, ag); + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, exclusive, &nr); xfs_perag_put(pag); @@ -285,7 +308,7 @@ xfs_sync_inode_attr( /* * Write out pagecache data for the whole filesystem. */ -int +STATIC int xfs_sync_data( struct xfs_mount *mp, int flags) @@ -306,7 +329,7 @@ xfs_sync_data( /* * Write out inode metadata (attributes) for the whole filesystem. */ -int +STATIC int xfs_sync_attr( struct xfs_mount *mp, int flags) @@ -339,8 +362,7 @@ xfs_commit_dummy_trans( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -640,6 +662,17 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } pag->pag_ici_reclaimable++; } @@ -674,6 +707,16 @@ __xfs_inode_clear_reclaim_tag( radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } } /* @@ -812,7 +855,36 @@ out: reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_ireclaim(ip); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + write_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + write_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. We get + * both the ilock and the iolock because the code may need to drop the + * ilock one but will still hold the iolock. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + xfs_inode_free(ip); return error; } @@ -828,83 +900,52 @@ xfs_reclaim_inodes( /* * Shrinker infrastructure. - * - * This is all far more complex than it needs to be. It adds a global list of - * mounts because the shrinkers can only call a global context. We need to make - * the shrinkers pass a context to avoid the need for global state. */ -static LIST_HEAD(xfs_mount_list); -static struct rw_semaphore xfs_mount_list_lock; - static int xfs_reclaim_inode_shrink( + struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct xfs_mount *mp; struct xfs_perag *pag; xfs_agnumber_t ag; - int reclaimable = 0; + int reclaimable; + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { if (!(gfp_mask & __GFP_FS)) return -1; - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); - if (nr_to_scan <= 0) - break; - } - up_read(&xfs_mount_list_lock); - } + /* if we don't exhaust the scan, don't bother coming back */ + if (nr_to_scan > 0) + return -1; + } - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - pag = xfs_perag_get(mp, ag); - reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); - } + reclaimable = 0; + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, + XFS_ICI_RECLAIM_TAG))) { + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); } - up_read(&xfs_mount_list_lock); return reclaimable; } -static struct shrinker xfs_inode_shrinker = { - .shrink = xfs_reclaim_inode_shrink, - .seeks = DEFAULT_SEEKS, -}; - -void __init -xfs_inode_shrinker_init(void) -{ - init_rwsem(&xfs_mount_list_lock); - register_shrinker(&xfs_inode_shrinker); -} - -void -xfs_inode_shrinker_destroy(void) -{ - ASSERT(list_empty(&xfs_mount_list)); - unregister_shrinker(&xfs_inode_shrinker); -} - void xfs_inode_shrinker_register( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_add_tail(&mp->m_mplist, &xfs_mount_list); - up_write(&xfs_mount_list_lock); + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); } void xfs_inode_shrinker_unregister( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_del(&mp->m_mplist); - up_write(&xfs_mount_list_lock); + unregister_shrinker(&mp->m_inode_shrink); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index cdcbaac..fe78726 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -35,9 +35,6 @@ typedef struct xfs_sync_work { int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); -int xfs_sync_attr(struct xfs_mount *mp, int flags); -int xfs_sync_data(struct xfs_mount *mp, int flags); - int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); @@ -55,8 +52,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, int tag, int write_lock, int *nr_to_scan); -void xfs_inode_shrinker_init(void); -void xfs_inode_shrinker_destroy(void); void xfs_inode_shrinker_register(struct xfs_mount *mp); void xfs_inode_shrinker_unregister(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index d12be84..88d25d4 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -24,17 +24,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_itable.h" diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 73d5aa1..c657cdc 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, agno, refcount, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, @@ -314,8 +317,6 @@ DEFINE_BUF_EVENT(xfs_buf_init); DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); DEFINE_BUF_EVENT(xfs_buf_rele); -DEFINE_BUF_EVENT(xfs_buf_pin); -DEFINE_BUF_EVENT(xfs_buf_unpin); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_iorequest); DEFINE_BUF_EVENT(xfs_buf_bawrite); @@ -538,7 +539,7 @@ DEFINE_LOCK_EVENT(xfs_ilock_nowait); DEFINE_LOCK_EVENT(xfs_ilock_demote); DEFINE_LOCK_EVENT(xfs_iunlock); -DECLARE_EVENT_CLASS(xfs_iget_class, +DECLARE_EVENT_CLASS(xfs_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), TP_STRUCT__entry( @@ -554,16 +555,38 @@ DECLARE_EVENT_CLASS(xfs_iget_class, __entry->ino) ) -#define DEFINE_IGET_EVENT(name) \ -DEFINE_EVENT(xfs_iget_class, name, \ +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) -DEFINE_IGET_EVENT(xfs_iget_skip); -DEFINE_IGET_EVENT(xfs_iget_reclaim); -DEFINE_IGET_EVENT(xfs_iget_found); -DEFINE_IGET_EVENT(xfs_iget_alloc); - -DECLARE_EVENT_CLASS(xfs_inode_class, +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_check_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_write_inode); +DEFINE_INODE_EVENT(xfs_clear_inode); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), TP_STRUCT__entry( @@ -588,20 +611,71 @@ DECLARE_EVENT_CLASS(xfs_inode_class, (char *)__entry->caller_ip) ) -#define DEFINE_INODE_EVENT(name) \ -DEFINE_EVENT(xfs_inode_class, name, \ +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) -DEFINE_INODE_EVENT(xfs_ihold); -DEFINE_INODE_EVENT(xfs_irele); -DEFINE_INODE_EVENT(xfs_inode_pin); -DEFINE_INODE_EVENT(xfs_inode_unpin); -DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __get_str(name)) +) -/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ -DEFINE_INODE_EVENT(xfs_inode); -#define xfs_itrace_entry(ip) \ - trace_xfs_inode(ip, _THIS_IP_) +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %s target name %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __get_str(src_name), + __get_str(target_name)) +) DECLARE_EVENT_CLASS(xfs_dquot_class, TP_PROTO(struct xfs_dquot *dqp), @@ -681,9 +755,6 @@ DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); -/* not really iget events, but we re-use the format */ -DEFINE_IGET_EVENT(xfs_dquot_dqalloc); -DEFINE_IGET_EVENT(xfs_dquot_dqdetach); DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_PROTO(struct log *log, struct xlog_ticket *tic), @@ -831,33 +902,29 @@ DECLARE_EVENT_CLASS(xfs_page_class, __field(loff_t, size) __field(unsigned long, offset) __field(int, delalloc) - __field(int, unmapped) __field(int, unwritten) ), TP_fast_assign( - int delalloc = -1, unmapped = -1, unwritten = -1; + int delalloc = -1, unwritten = -1; if (page_has_buffers(page)) - xfs_count_page_state(page, &delalloc, - &unmapped, &unwritten); + xfs_count_page_state(page, &delalloc, &unwritten); __entry->dev = inode->i_sb->s_dev; __entry->ino = XFS_I(inode)->i_ino; __entry->pgoff = page_offset(page); __entry->size = i_size_read(inode); __entry->offset = off; __entry->delalloc = delalloc; - __entry->unmapped = unmapped; __entry->unwritten = unwritten; ), TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "delalloc %d unmapped %d unwritten %d", + "delalloc %d unwritten %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pgoff, __entry->size, __entry->offset, __entry->delalloc, - __entry->unmapped, __entry->unwritten) ) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 585e763..e1a2f68 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -64,8 +54,6 @@ flush lock - ditto. */ -STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *); - #ifdef DEBUG xfs_buftarg_t *xfs_dqerror_target; int xfs_do_dqerror; @@ -390,21 +378,14 @@ xfs_qm_dqalloc( return (ESRCH); } - /* - * xfs_trans_commit normally decrements the vnode ref count - * when it unlocks the inode. Since we want to keep the quota - * inode around, we bump the vnode ref count now. - */ - IHOLD(quotip); - - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL); nmaps = 1; if ((error = xfs_bmapi(tp, quotip, offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist, NULL))) { + &map, &nmaps, &flist))) { goto error0; } ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -520,7 +501,7 @@ xfs_qm_dqtobp( error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL, NULL); + NULL, 0, &map, &nmaps, NULL); xfs_iunlock(quotip, XFS_ILOCK_SHARED); if (error) @@ -1141,6 +1122,46 @@ xfs_qm_dqrele( xfs_qm_dqput(dqp); } +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} /* * Write a modified dquot to disk. @@ -1222,8 +1243,9 @@ xfs_qm_dqflush( * Attach an iodone routine so that we can remove this dquot from the * AIL and release the flush lock once the dquot is synced to disk. */ - xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) - xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. @@ -1247,50 +1269,6 @@ xfs_qm_dqflush( } -/* - * This is the dquot flushing I/O completion routine. It is called - * from interrupt level when the buffer containing the dquot is - * flushed to disk. It is responsible for removing the dquot logitem - * from the AIL if it has not been re-logged, and unlocking the dquot's - * flush lock. This behavior is very similar to that of inodes.. - */ -/*ARGSUSED*/ -STATIC void -xfs_qm_dqflush_done( - xfs_buf_t *bp, - xfs_dq_logitem_t *qip) -{ - xfs_dquot_t *dqp; - struct xfs_ail *ailp; - - dqp = qip->qli_dquot; - ailp = qip->qli_item.li_ailp; - - /* - * We only want to pull the item from the AIL if its - * location in the log has not changed since we started the flush. - * Thus, we only bother if the dquot's lsn has - * not changed. First we check the lsn outside the lock - * since it's cheaper, and then we recheck while - * holding the lock before removing the dquot from the AIL. - */ - if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - qip->qli_item.li_lsn == qip->qli_flush_lsn) { - - /* xfs_trans_ail_delete() drops the AIL lock. */ - spin_lock(&ailp->xa_lock); - if (qip->qli_item.li_lsn == qip->qli_flush_lsn) - xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip); - else - spin_unlock(&ailp->xa_lock); - } - - /* - * Release the dq's flush lock since we're done with it. - */ - xfs_dqfunlock(dqp); -} - int xfs_qm_dqlock_nowait( xfs_dquot_t *dqp) diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 8d89a24..2a1f3dc 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -23,42 +23,36 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + /* * returns the number of iovecs needed to log the given dquot item. */ -/* ARGSUSED */ STATIC uint xfs_qm_dquot_logitem_size( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip) { /* * we need only two iovecs, one for the format, one for the real thing */ - return (2); + return 2; } /* @@ -66,22 +60,21 @@ xfs_qm_dquot_logitem_size( */ STATIC void xfs_qm_dquot_logitem_format( - xfs_dq_logitem_t *logitem, - xfs_log_iovec_t *logvec) + struct xfs_log_item *lip, + struct xfs_log_iovec *logvec) { - ASSERT(logitem); - ASSERT(logitem->qli_dquot); + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; + logvec->i_addr = &qlip->qli_format; logvec->i_len = sizeof(xfs_dq_logformat_t); logvec->i_type = XLOG_REG_TYPE_QFORMAT; logvec++; - logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; + logvec->i_addr = &qlip->qli_dquot->q_core; logvec->i_len = sizeof(xfs_disk_dquot_t); logvec->i_type = XLOG_REG_TYPE_DQUOT; - ASSERT(2 == logitem->qli_item.li_desc->lid_size); - logitem->qli_format.qlf_size = 2; + ASSERT(2 == lip->li_desc->lid_size); + qlip->qli_format.qlf_size = 2; } @@ -90,9 +83,9 @@ xfs_qm_dquot_logitem_format( */ STATIC void xfs_qm_dquot_logitem_pin( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp = logitem->qli_dquot; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); atomic_inc(&dqp->q_pincount); @@ -104,27 +97,18 @@ xfs_qm_dquot_logitem_pin( * dquot must have been previously pinned with a call to * xfs_qm_dquot_logitem_pin(). */ -/* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip, + int remove) { - xfs_dquot_t *dqp = logitem->qli_dquot; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; ASSERT(atomic_read(&dqp->q_pincount) > 0); if (atomic_dec_and_test(&dqp->q_pincount)) wake_up(&dqp->q_pinwait); } -/* ARGSUSED */ -STATIC void -xfs_qm_dquot_logitem_unpin_remove( - xfs_dq_logitem_t *logitem, - xfs_trans_t *tp) -{ - xfs_qm_dquot_logitem_unpin(logitem); -} - /* * Given the logitem, this writes the corresponding dquot entry to disk * asynchronously. This is called with the dquot entry securely locked; @@ -133,12 +117,10 @@ xfs_qm_dquot_logitem_unpin_remove( */ STATIC void xfs_qm_dquot_logitem_push( - xfs_dq_logitem_t *logitem) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; - int error; - - dqp = logitem->qli_dquot; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); @@ -160,27 +142,25 @@ xfs_qm_dquot_logitem_push( xfs_dqunlock(dqp); } -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_qm_dquot_logitem_committed( - xfs_dq_logitem_t *l, + struct xfs_log_item *lip, xfs_lsn_t lsn) { /* * We always re-log the entire dquot when it becomes dirty, * so, the latest copy _is_ the only one that matters. */ - return (lsn); + return lsn; } - /* * This is called to wait for the given dquot to be unpinned. * Most of these pin/unpin routines are plagiarized from inode code. */ void xfs_qm_dqunpin_wait( - xfs_dquot_t *dqp) + struct xfs_dquot *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); if (atomic_read(&dqp->q_pincount) == 0) @@ -206,13 +186,12 @@ xfs_qm_dqunpin_wait( */ STATIC void xfs_qm_dquot_logitem_pushbuf( - xfs_dq_logitem_t *qip) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; - xfs_mount_t *mp; - xfs_buf_t *bp; + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_dquot *dqp = qlip->qli_dquot; + struct xfs_buf *bp; - dqp = qip->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* @@ -220,22 +199,20 @@ xfs_qm_dquot_logitem_pushbuf( * inode flush completed and the inode was taken off the AIL. * So, just get out. */ - if (completion_done(&dqp->q_flush) || - ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { + if (completion_done(&dqp->q_flush) || + !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_dqunlock(dqp); return; } - mp = dqp->q_mount; - bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); + + bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, + dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) return; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); - return; - } /* @@ -250,15 +227,14 @@ xfs_qm_dquot_logitem_pushbuf( */ STATIC uint xfs_qm_dquot_logitem_trylock( - xfs_dq_logitem_t *qip) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - dqp = qip->qli_dquot; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (! xfs_qm_dqlock_nowait(dqp)) + if (!xfs_qm_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; if (!xfs_dqflock_nowait(dqp)) { @@ -269,11 +245,10 @@ xfs_qm_dquot_logitem_trylock( return XFS_ITEM_PUSHBUF; } - ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); + ASSERT(lip->li_flags & XFS_LI_IN_AIL); return XFS_ITEM_SUCCESS; } - /* * Unlock the dquot associated with the log item. * Clear the fields of the dquot and dquot log item that @@ -282,12 +257,10 @@ xfs_qm_dquot_logitem_trylock( */ STATIC void xfs_qm_dquot_logitem_unlock( - xfs_dq_logitem_t *ql) + struct xfs_log_item *lip) { - xfs_dquot_t *dqp; + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - ASSERT(ql != NULL); - dqp = ql->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* @@ -304,43 +277,32 @@ xfs_qm_dquot_logitem_unlock( xfs_dqunlock(dqp); } - /* * this needs to stamp an lsn into the dquot, I think. * rpc's that look at user dquot's would then have to * push on the dependency recorded in the dquot */ -/* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_committing( - xfs_dq_logitem_t *l, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - return; } - /* * This is the ops vector for dquots */ static struct xfs_item_ops xfs_dquot_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_dquot_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_qm_dquot_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*)) - xfs_qm_dquot_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_dquot_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, - .iop_pushbuf = (void(*)(xfs_log_item_t*)) - xfs_qm_dquot_logitem_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_dquot_logitem_committing + .iop_size = xfs_qm_dquot_logitem_size, + .iop_format = xfs_qm_dquot_logitem_format, + .iop_pin = xfs_qm_dquot_logitem_pin, + .iop_unpin = xfs_qm_dquot_logitem_unpin, + .iop_trylock = xfs_qm_dquot_logitem_trylock, + .iop_unlock = xfs_qm_dquot_logitem_unlock, + .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_push = xfs_qm_dquot_logitem_push, + .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, + .iop_committing = xfs_qm_dquot_logitem_committing }; /* @@ -350,10 +312,9 @@ static struct xfs_item_ops xfs_dquot_item_ops = { */ void xfs_qm_dquot_logitem_init( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp) { - xfs_dq_logitem_t *lp; - lp = &dqp->q_logitem; + struct xfs_dq_logitem *lp = &dqp->q_logitem; xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); @@ -374,16 +335,22 @@ xfs_qm_dquot_logitem_init( /*------------------ QUOTAOFF LOG ITEMS -------------------*/ +static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_qoff_logitem, qql_item); +} + + /* * This returns the number of iovecs needed to log the given quotaoff item. * We only need 1 iovec for an quotaoff item. It just logs the * quotaoff_log_format structure. */ -/*ARGSUSED*/ STATIC uint -xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip) { - return (1); + return 1; } /* @@ -394,53 +361,46 @@ xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) * slots in the quotaoff item have been filled. */ STATIC void -xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, - xfs_log_iovec_t *log_vector) +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) { - ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + + ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); + log_vector->i_addr = &qflip->qql_format; log_vector->i_len = sizeof(xfs_qoff_logitem_t); log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; - qf->qql_format.qf_size = 1; + qflip->qql_format.qf_size = 1; } - /* * Pinning has no meaning for an quotaoff item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) { - return; } - /* * Since pinning has no meaning for an quotaoff item, unpinning does * not either. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) { - return; -} - -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp) -{ - return; } /* * Quotaoff items have no locking, so just return success. */ -/*ARGSUSED*/ STATIC uint -xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_trylock( + struct xfs_log_item *lip) { return XFS_ITEM_LOCKED; } @@ -449,53 +409,51 @@ xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) * Quotaoff items have no locking or pushing, so return failure * so that the caller doesn't bother with us. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) { - return; } /* * The quotaoff-start-item is logged only once and cannot be moved in the log, * so simply return the lsn at which it's been logged. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) +xfs_qm_qoff_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return (lsn); + return lsn; } /* * There isn't much you can do to push on an quotaoff item. It is simply * stuck waiting for the log to be flushed to disk. */ -/*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip) { - return; } -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( - xfs_qoff_logitem_t *qfe, - xfs_lsn_t lsn) + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - xfs_qoff_logitem_t *qfs; - struct xfs_ail *ailp; + struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); + struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; + struct xfs_ail *ailp = qfs->qql_item.li_ailp; - qfs = qfe->qql_start_lip; - ailp = qfs->qql_item.li_ailp; - spin_lock(&ailp->xa_lock); /* * Delete the qoff-start logitem from the AIL. * xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; @@ -515,71 +473,52 @@ xfs_qm_qoffend_logitem_committed( * (truly makes the quotaoff irrevocable). If we do something else, * then maybe we don't need two. */ -/* ARGSUSED */ -STATIC void -xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) -{ - return; -} - -/* ARGSUSED */ STATIC void -xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) { - return; } static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_qoff_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) - xfs_qm_qoff_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoffend_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoffend_logitem_committing + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing }; /* * This is the ops vector shared by all quotaoff-start log items. */ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_qoff_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) - xfs_qm_qoff_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoff_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoff_logitem_committing + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_trylock = xfs_qm_qoff_logitem_trylock, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing }; /* * Allocate and initialize an quotaoff item of the correct quota type(s). */ -xfs_qoff_logitem_t * +struct xfs_qoff_logitem * xfs_qm_qoff_logitem_init( - struct xfs_mount *mp, - xfs_qoff_logitem_t *start, - uint flags) + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) { - xfs_qoff_logitem_t *qf; + struct xfs_qoff_logitem *qf; - qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); @@ -587,5 +526,5 @@ xfs_qm_qoff_logitem_init( qf->qql_format.qf_type = XFS_LI_QUOTAOFF; qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; - return (qf); + return qf; } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8c117ff..9a92407 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -23,25 +23,18 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -69,7 +62,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -1497,7 +1490,7 @@ xfs_qm_dqiterate( maxlblkcnt - lblkno, XFS_BMAPI_METADATA, NULL, - 0, map, &nmaps, NULL, NULL); + 0, map, &nmaps, NULL); xfs_iunlock(qip, XFS_ILOCK_SHARED); if (error) break; @@ -1669,7 +1662,8 @@ xfs_qm_dqusage_adjust( * making us disable quotas for the file system. */ if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { - xfs_iput(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); *res = BULKSTAT_RV_GIVEUP; return error; } @@ -1682,7 +1676,8 @@ xfs_qm_dqusage_adjust( * Walk thru the extent list and count the realtime blocks. */ if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { - xfs_iput(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); if (udqp) xfs_qm_dqput(udqp); if (gdqp) @@ -2117,7 +2112,10 @@ xfs_qm_shake_freelist( */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index 97b410c..bea02d7 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 3d1fc79..8671a0b 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" -#include "xfs_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_qm.h" diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index b448776..45e5849 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -26,25 +26,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" -#include "xfs_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -248,40 +238,74 @@ out_unlock: return error; } +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, + XFS_ITRUNCATE_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip); + + error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + int xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { int error = 0, error2 = 0; - xfs_inode_t *qip; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); return XFS_ERROR(EINVAL); } - if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip); - if (!error) { - error = xfs_truncate_file(mp, qip); - IRELE(qip); - } - } - - if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && - mp->m_sb.sb_gquotino != NULLFSINO) { - error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip); - if (!error2) { - error2 = xfs_truncate_file(mp, qip); - IRELE(qip); - } - } + if (flags & XFS_DQ_USER) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) + error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); return error ? error : error2; } - /* * Switch on (a given) quota enforcement for a filesystem. This takes * effect immediately. @@ -786,9 +810,9 @@ xfs_qm_export_dquot( } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || (XFS_IS_OQUOTA_ENFORCED(mp) && - (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && + (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { @@ -809,17 +833,17 @@ xfs_qm_export_qtype_flags( /* * Can't be more than one, or none. */ - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); + ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != + (FS_PROJ_QUOTA | FS_USER_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != + (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != + (FS_USER_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); return (flags & XFS_DQ_USER) ? - XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; + FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? + FS_PROJ_QUOTA : FS_GROUP_QUOTA; } STATIC uint @@ -830,16 +854,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) - uflags |= XFS_QUOTA_UDQ_ACCT; + uflags |= FS_QUOTA_UDQ_ACCT; if (flags & XFS_PQUOTA_ACCT) - uflags |= XFS_QUOTA_PDQ_ACCT; + uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) - uflags |= XFS_QUOTA_GDQ_ACCT; + uflags |= FS_QUOTA_GDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) - uflags |= XFS_QUOTA_UDQ_ENFD; + uflags |= FS_QUOTA_UDQ_ENFD; if (flags & (XFS_OQUOTA_ENFD)) { uflags |= (flags & XFS_GQUOTA_ACCT) ? - XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; + FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; } return (uflags); } @@ -875,8 +899,9 @@ xfs_dqrele_inode( xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - xfs_iput(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); return 0; } @@ -1143,7 +1168,8 @@ xfs_qm_internalqcheck_adjust( * of those now. */ if (! ipreleased) { - xfs_iput(ip, lock_flags); + xfs_iunlock(ip, lock_flags); + IRELE(ip); ipreleased = B_TRUE; goto again; } @@ -1160,7 +1186,8 @@ xfs_qm_internalqcheck_adjust( ASSERT(gd); xfs_qm_internalqcheck_dqadjust(ip, gd); } - xfs_iput(ip, lock_flags); + xfs_iunlock(ip, lock_flags); + IRELE(ip); *res = BULKSTAT_RV_DIDONE; return (0); } diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 061d827..7de91d1 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -23,25 +23,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -59,16 +49,14 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp = &dqp->q_logitem; - ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(lp->qli_dquot == dqp); + ASSERT(dqp->q_logitem.qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); /* * Initialize i_transp so we can later determine if this dquot is @@ -93,16 +81,11 @@ xfs_trans_log_dquot( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_log_item_desc_t *lidp; - ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; } /* @@ -874,9 +857,8 @@ xfs_trans_get_qoff_item( /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); - - return (q); + xfs_trans_add_item(tp, &q->qql_item); + return q; } @@ -890,13 +872,8 @@ xfs_trans_log_quotaoff_item( xfs_trans_t *tp, xfs_qoff_logitem_t *qlp) { - xfs_log_item_desc_t *lidp; - - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; } STATIC void diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 3f3610a..975aa10 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -22,7 +22,6 @@ #include "xfs_sb.h" #include "xfs_inum.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a7fbe8a..af168fa 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -24,18 +24,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -688,8 +683,6 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltbno; /* start bno of left side entry */ xfs_agblock_t ltbnoa; /* aligned ... */ xfs_extlen_t ltdiff; /* difference to left side entry */ - /*REFERENCED*/ - xfs_agblock_t ltend; /* end bno of left side entry */ xfs_extlen_t ltlen; /* length of left side entry */ xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ @@ -814,8 +807,7 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - ltend = ltbno + ltlen; - ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); @@ -828,7 +820,7 @@ xfs_alloc_ag_vextent_near( */ args->agbno = bnew; ASSERT(bnew >= ltbno); - ASSERT(bnew + blen <= ltend); + ASSERT(bnew + blen <= ltbno + ltlen); /* * Set up a cursor for the by-bno tree. */ @@ -1157,7 +1149,6 @@ xfs_alloc_ag_vextent_near( /* * Fix up the length and compute the useful address. */ - ltend = ltbno + ltlen; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) { @@ -1170,7 +1161,7 @@ xfs_alloc_ag_vextent_near( (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, ltlen, <new); ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltend); + ASSERT(ltnew + rlen <= ltbno + ltlen); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 6d05199..895009a 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -27,16 +27,16 @@ struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ -typedef enum xfs_alloctype -{ - XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */ - XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */ - XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */ - XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */ - XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */ - XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */ - XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ -} xfs_alloctype_t; +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_TYPES \ { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 83f4942..97f7328 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -24,19 +24,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_btree_trace.h" -#include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b9c196a..c256824 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -25,19 +25,13 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_attr.h" @@ -325,8 +319,7 @@ xfs_attr_set_int( return (error); } - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); + xfs_trans_ijoin(args.trans, dp); /* * If the attribute list is non-existent or a shortform list, @@ -396,10 +389,8 @@ xfs_attr_set_int( * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); - } + if (committed) + xfs_trans_ijoin(args.trans, dp); /* * Commit the leaf transformation. We'll need another (linked) @@ -544,8 +535,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); + xfs_trans_ijoin(args.trans, dp); /* * Decide on what work routines to call based on the inode size. @@ -821,8 +811,7 @@ xfs_attr_inactive(xfs_inode_t *dp) * No need to make quota reservations here. We expect to release some * blocks, not allocate, in the common case. */ - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); + xfs_trans_ijoin(trans, dp); /* * Decide on what work routines to call based on the inode size. @@ -981,10 +970,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); /* * Commit the current trans (including the inode) and start @@ -1085,10 +1072,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else xfs_da_buf_done(bp); @@ -1161,10 +1146,8 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else xfs_da_buf_done(bp); return(0); @@ -1317,10 +1300,8 @@ restart: * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); /* * Commit the node conversion and start the next @@ -1356,10 +1337,8 @@ restart: * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else { /* * Addition succeeded, update Btree hashvals. @@ -1470,10 +1449,8 @@ restart: * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } /* @@ -1604,10 +1581,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); /* * Commit the Btree join operation and start a new trans. @@ -1658,10 +1633,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); } else xfs_da_brelse(args->trans, bp); } @@ -2004,7 +1977,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, args->rmtblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL, NULL); + NULL, 0, map, &nmap, NULL); if (error) return(error); ASSERT(nmap >= 1); @@ -2083,7 +2056,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, args->firstblock, args->total, &map, &nmap, - args->flist, NULL); + args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -2099,10 +2072,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp); ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -2136,7 +2107,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) args->rmtblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, 0, &map, &nmap, - NULL, NULL); + NULL); if (error) { return(error); } @@ -2201,7 +2172,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) args->rmtblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, args->firstblock, 0, &map, &nmap, - args->flist, NULL); + args->flist); if (error) { return(error); } @@ -2239,7 +2210,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1, args->firstblock, args->flist, - NULL, &done); + &done); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); @@ -2255,10 +2226,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, args->dp); - } + if (committed) + xfs_trans_ijoin(args->trans, args->dp); /* * Close out trans and start the next one in the chain. diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index a90ce74..a6cff8e 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -24,8 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" @@ -33,7 +31,6 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -2931,7 +2928,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, nmap = 1; error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL, NULL); + NULL, 0, &map, &nmap, NULL); if (error) { return(error); } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 99587de..23f14e5 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -30,13 +30,10 @@ #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_dir2_data.h" #include "xfs_dir2_leaf.h" @@ -104,7 +101,6 @@ xfs_bmap_add_extent( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd); /* OK to allocate reserved blocks */ @@ -122,7 +118,6 @@ xfs_bmap_add_extent_delay_real( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd); /* OK to allocate reserved blocks */ /* @@ -135,7 +130,6 @@ xfs_bmap_add_extent_hole_delay( xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp,/* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd); /* OK to allocate reserved blocks */ /* @@ -149,7 +143,6 @@ xfs_bmap_add_extent_hole_real( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork); /* data or attr fork */ /* @@ -162,8 +155,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta); /* Change made to incore extents */ + int *logflagsp); /* inode logging flags */ /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. @@ -200,7 +192,6 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp,/* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd); /* OK to allocate reserved blocks */ @@ -489,7 +480,6 @@ xfs_bmap_add_extent( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd) /* OK to use reserved data blocks */ { @@ -524,15 +514,6 @@ xfs_bmap_add_extent( logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else logflags = 0; - /* DELTA: single new extent */ - if (delta) { - if (delta->xed_startoff > new->br_startoff) - delta->xed_startoff = new->br_startoff; - if (delta->xed_blockcount < - new->br_startoff + new->br_blockcount) - delta->xed_blockcount = new->br_startoff + - new->br_blockcount; - } } /* * Any kind of new delayed allocation goes here. @@ -542,7 +523,7 @@ xfs_bmap_add_extent( ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, - &logflags, delta, rsvd))) + &logflags, rsvd))) goto done; } /* @@ -553,7 +534,7 @@ xfs_bmap_add_extent( ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, delta, whichfork))) + &logflags, whichfork))) goto done; } else { xfs_bmbt_irec_t prev; /* old extent at offset idx */ @@ -578,17 +559,17 @@ xfs_bmap_add_extent( XFS_BTCUR_BPRV_WASDEL); if ((error = xfs_bmap_add_extent_delay_real(ip, idx, &cur, new, &da_new, first, flist, - &logflags, delta, rsvd))) + &logflags, rsvd))) goto done; } else if (new->br_state == XFS_EXT_NORM) { ASSERT(new->br_state == XFS_EXT_NORM); if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags, delta))) + ip, idx, &cur, new, &logflags))) goto done; } else { ASSERT(new->br_state == XFS_EXT_UNWRITTEN); if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags, delta))) + ip, idx, &cur, new, &logflags))) goto done; } ASSERT(*curp == cur || *curp == NULL); @@ -601,7 +582,7 @@ xfs_bmap_add_extent( ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, delta, whichfork))) + new, &logflags, whichfork))) goto done; } } @@ -666,7 +647,6 @@ xfs_bmap_add_extent_delay_real( xfs_fsblock_t *first, /* pointer to firstblock variable */ xfs_bmap_free_t *flist, /* list of extents to be freed */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd) /* OK to use reserved data block allocation */ { xfs_btree_cur_t *cur; /* btree cursor */ @@ -797,11 +777,6 @@ xfs_bmap_add_extent_delay_real( goto done; } *dnew = 0; - /* DELTA: Three in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -832,10 +807,6 @@ xfs_bmap_add_extent_delay_real( goto done; } *dnew = 0; - /* DELTA: Two in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -867,10 +838,6 @@ xfs_bmap_add_extent_delay_real( goto done; } *dnew = 0; - /* DELTA: Two in-core extents are replaced by one. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -900,9 +867,6 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(i == 1, done); } *dnew = 0; - /* DELTA: The in-core extent described by new changed type. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -942,10 +906,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; - /* DELTA: The boundary between two in-core extents moved. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING: @@ -990,9 +950,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); *dnew = temp; - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1031,10 +988,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; - /* DELTA: The boundary between two in-core extents moved. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_RIGHT_FILLING: @@ -1078,9 +1031,6 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); *dnew = temp; - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case 0: @@ -1161,9 +1111,6 @@ xfs_bmap_add_extent_delay_real( nullstartblock((int)temp2)); trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); *dnew = temp + temp2; - /* DELTA: One in-core extent is split in three. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1179,13 +1126,6 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } *curp = cur; - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } done: *logflagsp = rval; return error; @@ -1204,8 +1144,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_extnum_t idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta) /* Change made to incore extents */ + int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ @@ -1219,8 +1158,6 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; - xfs_filblks_t temp2=0; #define LEFT r[0] #define RIGHT r[1] @@ -1341,11 +1278,6 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount, LEFT.br_state))) goto done; } - /* DELTA: Three in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: @@ -1382,10 +1314,6 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_state))) goto done; } - /* DELTA: Two in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1422,10 +1350,6 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: Two in-core extents are replaced by one. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: @@ -1453,9 +1377,6 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: The in-core extent described by new changed type. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: @@ -1501,10 +1422,6 @@ xfs_bmap_add_extent_unwritten_real( LEFT.br_state)) goto done; } - /* DELTA: The boundary between two in-core extents moved. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; case BMAP_LEFT_FILLING: @@ -1544,9 +1461,6 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: @@ -1587,10 +1501,6 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: The boundary between two in-core extents moved. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; case BMAP_RIGHT_FILLING: @@ -1630,9 +1540,6 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case 0: @@ -1692,9 +1599,6 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in three. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: @@ -1710,13 +1614,6 @@ xfs_bmap_add_extent_unwritten_real( ASSERT(0); } *curp = cur; - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } done: *logflagsp = rval; return error; @@ -1736,7 +1633,6 @@ xfs_bmap_add_extent_hole_delay( xfs_extnum_t idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int rsvd) /* OK to allocate reserved blocks */ { xfs_bmbt_rec_host_t *ep; /* extent record for idx */ @@ -1747,7 +1643,6 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - xfs_filblks_t temp2=0; ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, idx); @@ -1819,9 +1714,6 @@ xfs_bmap_add_extent_hole_delay( xfs_iext_remove(ip, idx, 1, state); ip->i_df.if_lastex = idx - 1; - /* DELTA: Two in-core extents were replaced by one. */ - temp2 = temp; - temp = left.br_startoff; break; case BMAP_LEFT_CONTIG: @@ -1841,9 +1733,6 @@ xfs_bmap_add_extent_hole_delay( trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); ip->i_df.if_lastex = idx - 1; - /* DELTA: One in-core extent grew into a hole. */ - temp2 = temp; - temp = left.br_startoff; break; case BMAP_RIGHT_CONTIG: @@ -1862,9 +1751,6 @@ xfs_bmap_add_extent_hole_delay( trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); ip->i_df.if_lastex = idx; - /* DELTA: One in-core extent grew into a hole. */ - temp2 = temp; - temp = new->br_startoff; break; case 0: @@ -1876,9 +1762,6 @@ xfs_bmap_add_extent_hole_delay( oldlen = newlen = 0; xfs_iext_insert(ip, idx, 1, new, state); ip->i_df.if_lastex = idx; - /* DELTA: A new in-core extent was added in a hole. */ - temp2 = new->br_blockcount; - temp = new->br_startoff; break; } if (oldlen != newlen) { @@ -1889,13 +1772,6 @@ xfs_bmap_add_extent_hole_delay( * Nothing to do for disk quota accounting here. */ } - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } *logflagsp = 0; return 0; } @@ -1911,7 +1787,6 @@ xfs_bmap_add_extent_hole_real( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork) /* data or attr fork */ { xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */ @@ -1922,8 +1797,6 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - xfs_filblks_t temp=0; - xfs_filblks_t temp2=0; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); @@ -2020,11 +1893,6 @@ xfs_bmap_add_extent_hole_real( left.br_state))) goto done; } - /* DELTA: Two in-core extents were replaced by one. */ - temp = left.br_startoff; - temp2 = left.br_blockcount + - new->br_blockcount + - right.br_blockcount; break; case BMAP_LEFT_CONTIG: @@ -2056,10 +1924,6 @@ xfs_bmap_add_extent_hole_real( left.br_state))) goto done; } - /* DELTA: One in-core extent grew. */ - temp = left.br_startoff; - temp2 = left.br_blockcount + - new->br_blockcount; break; case BMAP_RIGHT_CONTIG: @@ -2092,10 +1956,6 @@ xfs_bmap_add_extent_hole_real( right.br_state))) goto done; } - /* DELTA: One in-core extent grew. */ - temp = new->br_startoff; - temp2 = new->br_blockcount + - right.br_blockcount; break; case 0: @@ -2123,18 +1983,8 @@ xfs_bmap_add_extent_hole_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: A new extent was added in a hole. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; } - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } done: *logflagsp = rval; return error; @@ -2959,7 +2809,6 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ int whichfork, /* data or attr fork */ int rsvd) /* OK to allocate reserved blocks */ { @@ -3265,14 +3114,6 @@ xfs_bmap_del_extent( if (da_old > da_new) xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int64_t)(da_old - da_new), rsvd); - if (delta) { - /* DELTA: report the original extent. */ - if (delta->xed_startoff > got.br_startoff) - delta->xed_startoff = got.br_startoff; - if (delta->xed_blockcount < got.br_startoff+got.br_blockcount) - delta->xed_blockcount = got.br_startoff + - got.br_blockcount; - } done: *logflagsp = flags; return error; @@ -3754,9 +3595,10 @@ xfs_bmap_add_attrfork( ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } ASSERT(ip->i_d.di_anextents == 0); - IHOLD(ip); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + switch (ip->i_d.di_format) { case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; @@ -4483,8 +4325,7 @@ xfs_bmapi( xfs_extlen_t total, /* total blocks needed */ xfs_bmbt_irec_t *mval, /* output: map values */ int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta) /* o: change made to incore extents */ + xfs_bmap_free_t *flist) /* i/o: list extents to free */ { xfs_fsblock_t abno; /* allocated block number */ xfs_extlen_t alen; /* allocated extent length */ @@ -4596,10 +4437,7 @@ xfs_bmapi( end = bno + len; obno = bno; bma.ip = NULL; - if (delta) { - delta->xed_startoff = NULLFILEOFF; - delta->xed_blockcount = 0; - } + while (bno < end && n < *nmap) { /* * Reading past eof, act as though there's a hole @@ -4620,19 +4458,13 @@ xfs_bmapi( * allocate the stuff asked for in this bmap call * but that wouldn't be as good. */ - if (wasdelay && !(flags & XFS_BMAPI_EXACT)) { + if (wasdelay) { alen = (xfs_extlen_t)got.br_blockcount; aoff = got.br_startoff; if (lastx != NULLEXTNUM && lastx) { ep = xfs_iext_get_ext(ifp, lastx - 1); xfs_bmbt_get_all(ep, &prev); } - } else if (wasdelay) { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, - (got.br_startoff + - got.br_blockcount) - bno); - aoff = bno; } else { alen = (xfs_extlen_t) XFS_FILBLKS_MIN(len, MAXEXTLEN); @@ -4831,7 +4663,7 @@ xfs_bmapi( got.br_state = XFS_EXT_UNWRITTEN; } error = xfs_bmap_add_extent(ip, lastx, &cur, &got, - firstblock, flist, &tmp_logflags, delta, + firstblock, flist, &tmp_logflags, whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) @@ -4927,7 +4759,7 @@ xfs_bmapi( } mval->br_state = XFS_EXT_NORM; error = xfs_bmap_add_extent(ip, lastx, &cur, mval, - firstblock, flist, &tmp_logflags, delta, + firstblock, flist, &tmp_logflags, whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); logflags |= tmp_logflags; if (error) @@ -5017,14 +4849,6 @@ xfs_bmapi( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); error = 0; - if (delta && delta->xed_startoff != NULLFILEOFF) { - /* A change was actually made. - * Note that delta->xed_blockount is an offset at this - * point and needs to be converted to a block count. - */ - ASSERT(delta->xed_blockcount > delta->xed_startoff); - delta->xed_blockcount -= delta->xed_startoff; - } error0: /* * Log everything. Do this after conversion, there's no point in @@ -5136,8 +4960,6 @@ xfs_bunmapi( xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta, /* o: change made to incore - extents */ int *done) /* set if not done yet */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ @@ -5196,10 +5018,7 @@ xfs_bunmapi( bno = start + len - 1; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); - if (delta) { - delta->xed_startoff = NULLFILEOFF; - delta->xed_blockcount = 0; - } + /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... @@ -5297,7 +5116,7 @@ xfs_bunmapi( } del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx, &cur, &del, - firstblock, flist, &logflags, delta, + firstblock, flist, &logflags, XFS_DATA_FORK, 0); if (error) goto error0; @@ -5352,7 +5171,7 @@ xfs_bunmapi( prev.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx - 1, &cur, &prev, firstblock, flist, &logflags, - delta, XFS_DATA_FORK, 0); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5361,7 +5180,7 @@ xfs_bunmapi( del.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent(ip, lastx, &cur, &del, firstblock, flist, &logflags, - delta, XFS_DATA_FORK, 0); + XFS_DATA_FORK, 0); if (error) goto error0; goto nodelete; @@ -5414,7 +5233,7 @@ xfs_bunmapi( goto error0; } error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, - &tmp_logflags, delta, whichfork, rsvd); + &tmp_logflags, whichfork, rsvd); logflags |= tmp_logflags; if (error) goto error0; @@ -5471,14 +5290,6 @@ nodelete: ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; - if (delta && delta->xed_startoff != NULLFILEOFF) { - /* A change was actually made. - * Note that delta->xed_blockount is an offset at this - * point and needs to be converted to a block count. - */ - ASSERT(delta->xed_blockcount > delta->xed_startoff); - delta->xed_blockcount -= delta->xed_startoff; - } error0: /* * Log everything. Do this after conversion, there's no point in @@ -5605,28 +5416,6 @@ xfs_getbmap( prealloced = 0; fixlen = 1LL << 32; } else { - /* - * If the BMV_IF_NO_DMAPI_READ interface bit specified, do - * not generate a DMAPI read event. Otherwise, if the - * DM_EVENT_READ bit is set for the file, generate a read - * event in order that the DMAPI application may do its thing - * before we return the extents. Usually this means restoring - * user file data to regions of the file that look like holes. - * - * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify - * BMV_IF_NO_DMAPI_READ so that read events are generated. - * If this were not true, callers of ioctl(XFS_IOC_GETBMAP) - * could misinterpret holes in a DMAPI file as true holes, - * when in fact they may represent offline user data. - */ - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && - !(iflags & BMV_IF_NO_DMAPI_READ)) { - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, - 0, 0, 0, NULL); - if (error) - return XFS_ERROR(error); - } - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -5713,7 +5502,7 @@ xfs_getbmap( error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), XFS_BB_TO_FSB(mp, bmv->bmv_length), bmapi_flags, NULL, 0, map, &nmap, - NULL, NULL); + NULL); if (error) goto out_free_map; ASSERT(nmap <= subnex); @@ -5859,66 +5648,34 @@ xfs_bmap_eof( } #ifdef DEBUG -STATIC -xfs_buf_t * +STATIC struct xfs_buf * xfs_bmap_get_bp( - xfs_btree_cur_t *cur, + struct xfs_btree_cur *cur, xfs_fsblock_t bno) { - int i; - xfs_buf_t *bp; + struct xfs_log_item_desc *lidp; + int i; if (!cur) - return(NULL); - - bp = NULL; - for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - bp = cur->bc_bufs[i]; - if (!bp) break; - if (XFS_BUF_ADDR(bp) == bno) - break; /* Found it */ - } - if (i == XFS_BTREE_MAXLEVELS) - bp = NULL; - - if (!bp) { /* Chase down all the log items to see if the bp is there */ - xfs_log_item_chunk_t *licp; - xfs_trans_t *tp; - - tp = cur->bc_tp; - licp = &tp->t_items; - while (!bp && licp != NULL) { - if (xfs_lic_are_all_free(licp)) { - licp = licp->lic_next; - continue; - } - for (i = 0; i < licp->lic_unused; i++) { - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_buf_log_item_t *bip; - xfs_buf_t *lbp; - - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - lip = lidp->lid_item; - if (lip->li_type != XFS_LI_BUF) - continue; + return NULL; - bip = (xfs_buf_log_item_t *)lip; - lbp = bip->bli_buf; + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } - if (XFS_BUF_ADDR(lbp) == bno) { - bp = lbp; - break; /* Found it */ - } - } - licp = licp->lic_next; - } + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; } - return(bp); + + return NULL; } STATIC void diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 419dafb..b13569a 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -28,20 +28,6 @@ struct xfs_trans; extern kmem_zone_t *xfs_bmap_free_item_zone; /* - * DELTA: describe a change to the in-core extent list. - * - * Internally the use of xed_blockount is somewhat funky. - * xed_blockcount contains an offset much of the time because this - * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are - * the same underlying type). - */ -typedef struct xfs_extdelta -{ - xfs_fileoff_t xed_startoff; /* offset of range */ - xfs_filblks_t xed_blockcount; /* blocks in range */ -} xfs_extdelta_t; - -/* * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ @@ -82,16 +68,13 @@ typedef struct xfs_bmap_free #define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ #define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ -#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ -#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ -#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ -#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ +#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ +#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */ +#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ -/* XFS_BMAPI_DIRECT_IO 0x800 */ -#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */ +#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */ +#define XFS_BMAPI_CONVERT 0x200 /* unwritten extent conversion - */ /* need write cache flushing and no */ /* additional allocation alignments */ @@ -100,9 +83,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_DELAY, "DELAY" }, \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ - { XFS_BMAPI_EXACT, "EXACT" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ - { XFS_BMAPI_ASYNC, "ASYNC" }, \ { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ @@ -310,9 +291,7 @@ xfs_bmapi( xfs_extlen_t total, /* total blocks needed */ struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta); /* o: change made to incore - extents */ + xfs_bmap_free_t *flist); /* i/o: list extents to free */ /* * Map file blocks to filesystem blocks, simple version. @@ -346,8 +325,6 @@ xfs_bunmapi( xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta, /* o: change made to incore - extents */ int *done); /* set if not done yet */ /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 416e47e..87d3c10 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -24,21 +24,16 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_btree_trace.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 96be4b0..829af92 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -24,20 +24,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_btree.h" #include "xfs_btree_trace.h" -#include "xfs_ialloc.h" #include "xfs_error.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 02a8098..1b09d7a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -24,7 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -34,6 +33,12 @@ kmem_zone_t *xfs_buf_item_zone; +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_buf_log_item, bli_item); +} + + #ifdef XFS_TRANS_DEBUG /* * This function uses an alternate strategy for tracking the bytes @@ -151,12 +156,13 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); */ STATIC uint xfs_buf_item_size( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - uint nvecs; - int next_bit; - int last_bit; - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint nvecs; + int next_bit; + int last_bit; ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { @@ -170,7 +176,6 @@ xfs_buf_item_size( return 1; } - bp = bip->bli_buf; ASSERT(bip->bli_flags & XFS_BLI_LOGGED); nvecs = 1; last_bit = xfs_next_bit(bip->bli_format.blf_data_map, @@ -219,13 +224,13 @@ xfs_buf_item_size( */ STATIC void xfs_buf_item_format( - xfs_buf_log_item_t *bip, - xfs_log_iovec_t *log_vector) + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) { + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; uint base_size; uint nvecs; - xfs_log_iovec_t *vecp; - xfs_buf_t *bp; int first_bit; int last_bit; int next_bit; @@ -235,8 +240,6 @@ xfs_buf_item_format( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); - bp = bip->bli_buf; - vecp = log_vector; /* * The size of the base structure is the size of the @@ -248,7 +251,7 @@ xfs_buf_item_format( base_size = (uint)(sizeof(xfs_buf_log_format_t) + ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); - vecp->i_addr = (xfs_caddr_t)&bip->bli_format; + vecp->i_addr = &bip->bli_format; vecp->i_len = base_size; vecp->i_type = XLOG_REG_TYPE_BFORMAT; vecp++; @@ -263,7 +266,7 @@ xfs_buf_item_format( */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - xfs_log_item_in_current_chkpt(&bip->bli_item))) + xfs_log_item_in_current_chkpt(lip))) bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } @@ -356,66 +359,90 @@ xfs_buf_item_format( /* * This is called to pin the buffer associated with the buf log item in memory - * so it cannot be written out. Simply call bpin() on the buffer to do this. + * so it cannot be written out. * * We also always take a reference to the buffer log item here so that the bli * is held while the item is pinned in memory. This means that we can * unconditionally drop the reference count a transaction holds when the * transaction is completed. */ - STATIC void xfs_buf_item_pin( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); - bp = bip->bli_buf; - ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); - atomic_inc(&bip->bli_refcount); + trace_xfs_buf_item_pin(bip); - xfs_bpin(bp); -} + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} /* * This is called to unpin the buffer associated with the buf log * item which was previously pinned with a call to xfs_buf_item_pin(). - * Just call bunpin() on the buffer to do this. * * Also drop the reference to the buf item for the current transaction. * If the XFS_BLI_STALE flag is set and we are the last reference, * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip, + int remove) { - struct xfs_ail *ailp; - xfs_buf_t *bp; - int freed; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; + int freed; - bp = bip->bli_buf; - ASSERT(bp != NULL); ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); - ailp = bip->bli_item.li_ailp; - xfs_bunpin(bp); + + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + trace_xfs_buf_item_unpin_stale(bip); + if (remove) { + /* + * We have to remove the log item from the transaction + * as we are about to release our reference to the + * buffer. If we don't, the unlock that occurs later + * in xfs_trans_uncommit() will ry to reference the + * buffer which we no longer have a hold on. + */ + xfs_trans_del_item(lip); + + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + XFS_BUF_SET_FSPRIVATE2(bp, NULL); + } + /* * If we get called here because of an IO error, we may * or may not have the item on the AIL. xfs_trans_ail_delete() @@ -437,48 +464,6 @@ xfs_buf_item_unpin( } /* - * this is called from uncommit in the forced-shutdown path. - * we need to check to see if the reference count on the log item - * is going to drop to zero. If so, unpin will free the log item - * so we need to free the item's descriptor (that points to the item) - * in the transaction. - */ -STATIC void -xfs_buf_item_unpin_remove( - xfs_buf_log_item_t *bip, - xfs_trans_t *tp) -{ - /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */ - if ((atomic_read(&bip->bli_refcount) == 1) && - (bip->bli_flags & XFS_BLI_STALE)) { - /* - * yes -- We can safely do some work here and then call - * buf_item_unpin to do the rest because we are - * are holding the buffer locked so no one else will be - * able to bump up the refcount. We have to remove the - * log item from the transaction as we are about to release - * our reference to the buffer. If we don't, the unlock that - * occurs later in the xfs_trans_uncommit() will try to - * reference the buffer which we no longer have a hold on. - */ - struct xfs_log_item_desc *lidp; - - ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); - trace_xfs_buf_item_unpin_stale(bip); - - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip); - xfs_trans_free_item(tp, lidp); - - /* - * Since the transaction no longer refers to the buffer, the - * buffer should no longer refer to the transaction. - */ - XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL); - } - xfs_buf_item_unpin(bip); -} - -/* * This is called to attempt to lock the buffer associated with this * buf log item. Don't sleep on the buffer lock. If we can't get * the lock right away, return 0. If we can get the lock, take a @@ -488,11 +473,11 @@ xfs_buf_item_unpin_remove( */ STATIC uint xfs_buf_item_trylock( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; - bp = bip->bli_buf; if (XFS_BUF_ISPINNED(bp)) return XFS_ITEM_PINNED; if (!XFS_BUF_CPSEMA(bp)) @@ -529,13 +514,12 @@ xfs_buf_item_trylock( */ STATIC void xfs_buf_item_unlock( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - int aborted; - xfs_buf_t *bp; - uint hold; - - bp = bip->bli_buf; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + int aborted; + uint hold; /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); @@ -546,7 +530,7 @@ xfs_buf_item_unlock( * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ - aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; + aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; /* * Before possibly freeing the buf item, determine if we should @@ -607,16 +591,16 @@ xfs_buf_item_unlock( */ STATIC xfs_lsn_t xfs_buf_item_committed( - xfs_buf_log_item_t *bip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + trace_xfs_buf_item_committed(bip); - if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - (bip->bli_item.li_lsn != 0)) { - return bip->bli_item.li_lsn; - } - return (lsn); + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) + return lip->li_lsn; + return lsn; } /* @@ -626,15 +610,16 @@ xfs_buf_item_committed( */ STATIC void xfs_buf_item_push( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); + trace_xfs_buf_item_push(bip); - bp = bip->bli_buf; - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); xfs_buf_relse(bp); } @@ -646,22 +631,24 @@ xfs_buf_item_push( */ STATIC void xfs_buf_item_pushbuf( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(XFS_BUF_ISDELAYWRITE(bp)); + trace_xfs_buf_item_pushbuf(bip); - bp = bip->bli_buf; - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); } -/* ARGSUSED */ STATIC void -xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) { } @@ -669,21 +656,16 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) * This is the ops vector shared by all buf log items. */ static struct xfs_item_ops xfs_buf_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_buf_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) - xfs_buf_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_buf_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_buf_item_committing + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_trylock = xfs_buf_item_trylock, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_pushbuf = xfs_buf_item_pushbuf, + .iop_committing = xfs_buf_item_committing }; @@ -712,7 +694,6 @@ xfs_buf_item_init( */ if (bp->b_mount != mp) bp->b_mount = mp; - XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); if (lip->li_type == XFS_LI_BUF) { @@ -1098,15 +1079,14 @@ xfs_buf_error_relse( * It is called by xfs_buf_iodone_callbacks() above which will take * care of cleaning up the buffer itself. */ -/* ARGSUSED */ void xfs_buf_iodone( - xfs_buf_t *bp, - xfs_buf_log_item_t *bip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - struct xfs_ail *ailp = bip->bli_item.li_ailp; + struct xfs_ail *ailp = lip->li_ailp; - ASSERT(bip->bli_buf == bp); + ASSERT(BUF_ITEM(lip)->bli_buf == bp); xfs_buf_rele(bp); @@ -1120,6 +1100,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); - xfs_buf_item_free(bip); + xfs_trans_ail_delete(ailp, lip); + xfs_buf_item_free(BUF_ITEM(lip)); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f20bb47..0e2ed43 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -124,7 +124,7 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); -void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); +void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); #ifdef XFS_TRANS_DEBUG void diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 0ca556b..30fa0e2 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -25,19 +25,14 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" @@ -581,16 +576,14 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, xfs_da_intnode_t *node; xfs_da_node_entry_t *btree; int tmp; - xfs_mount_t *mp; node = oldblk->bp->data; - mp = state->mp; ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) - ASSERT(newblk->blkno >= mp->m_dirleafblk && - newblk->blkno < mp->m_dirfreeblk); + ASSERT(newblk->blkno >= state->mp->m_dirleafblk && + newblk->blkno < state->mp->m_dirfreeblk); /* * We may need to make some room before we insert the new node. @@ -1601,7 +1594,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist, NULL))) { + args->flist))) { return error; } ASSERT(nmap <= 1); @@ -1622,8 +1615,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist, - NULL))) { + &mapp[mapi], &nmap, args->flist))) { kmem_free(mapp); return error; } @@ -1884,7 +1876,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, */ if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, NULL, + 0, args->firstblock, args->flist, &done)) == ENOSPC) { if (w != XFS_DATA_FORK) break; @@ -1989,7 +1981,7 @@ xfs_da_do_buf( nfsb, XFS_BMAPI_METADATA | xfs_bmapi_aflag(whichfork), - NULL, 0, mapp, &nmap, NULL, NULL))) + NULL, 0, mapp, &nmap, NULL))) goto exit0; } } else { diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7f159d2..3b9582c 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -24,24 +24,15 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_dfrag.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -425,11 +416,8 @@ xfs_swap_extents( } - IHOLD(ip); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - IHOLD(tip); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 42520f0..a1321bc 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -25,13 +25,11 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -382,7 +380,7 @@ xfs_readdir( int rval; /* return value */ int v; /* type-checking value */ - xfs_itrace_entry(dp); + trace_xfs_readdir(dp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); @@ -549,7 +547,7 @@ xfs_dir2_grow_inode( if ((error = xfs_bmapi(tp, dp, bno, count, XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist, NULL))) + args->flist))) return error; ASSERT(nmap <= 1); if (nmap == 1) { @@ -581,8 +579,7 @@ xfs_dir2_grow_inode( if ((error = xfs_bmapi(tp, dp, b, c, XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist, - NULL))) { + &mapp[mapi], &nmap, args->flist))) { kmem_free(mapp); return error; } @@ -715,7 +712,7 @@ xfs_dir2_shrink_inode( */ if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, - NULL, &done))) { + &done))) { /* * ENOSPC actually can happen if we're in a removename with * no space reservation, and the resulting block removal diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 779a267..580d99c 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -1073,10 +1071,10 @@ xfs_dir2_sf_to_block( */ buf_len = dp->i_df.if_bytes; - buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); + buf = kmem_alloc(buf_len, KM_SLEEP); - memcpy(buf, sfp, dp->i_df.if_bytes); - xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); + memcpy(buf, sfp, buf_len); + xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK); dp->i_d.di_size = 0; xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 498f8d6..921595b 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_dir2_data.h" diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index e2d8985..504be86 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -25,11 +25,9 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_attr_sf.h" #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -875,7 +873,7 @@ xfs_dir2_leaf_getdents( xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - map_off, XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL, NULL); + &map[map_valid], &nmap, NULL); /* * Don't know if we should ignore this or * try to return an error. diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 78fc4d9..f9a0864 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index c1a5945..b1bae6b 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -24,12 +24,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h deleted file mode 100644 index 2813cdd..0000000 --- a/fs/xfs/xfs_dmapi.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DMAPI_H__ -#define __XFS_DMAPI_H__ - -/* Values used to define the on-disk version of dm_attrname_t. All - * on-disk attribute names start with the 8-byte string "SGI_DMI_". - * - * In the on-disk inode, DMAPI attribute names consist of the user-provided - * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be - * changed. - */ - -#define DMATTR_PREFIXLEN 8 -#define DMATTR_PREFIXSTRING "SGI_DMI_" - -typedef enum { - DM_EVENT_INVALID = -1, - DM_EVENT_CANCEL = 0, /* not supported */ - DM_EVENT_MOUNT = 1, - DM_EVENT_PREUNMOUNT = 2, - DM_EVENT_UNMOUNT = 3, - DM_EVENT_DEBUT = 4, /* not supported */ - DM_EVENT_CREATE = 5, - DM_EVENT_CLOSE = 6, /* not supported */ - DM_EVENT_POSTCREATE = 7, - DM_EVENT_REMOVE = 8, - DM_EVENT_POSTREMOVE = 9, - DM_EVENT_RENAME = 10, - DM_EVENT_POSTRENAME = 11, - DM_EVENT_LINK = 12, - DM_EVENT_POSTLINK = 13, - DM_EVENT_SYMLINK = 14, - DM_EVENT_POSTSYMLINK = 15, - DM_EVENT_READ = 16, - DM_EVENT_WRITE = 17, - DM_EVENT_TRUNCATE = 18, - DM_EVENT_ATTRIBUTE = 19, - DM_EVENT_DESTROY = 20, - DM_EVENT_NOSPACE = 21, - DM_EVENT_USER = 22, - DM_EVENT_MAX = 23 -} dm_eventtype_t; -#define HAVE_DM_EVENTTYPE_T - -typedef enum { - DM_RIGHT_NULL, - DM_RIGHT_SHARED, - DM_RIGHT_EXCL -} dm_right_t; -#define HAVE_DM_RIGHT_T - -/* Defines for determining if an event message should be sent. */ -#ifdef HAVE_DMAPI -#define DM_EVENT_ENABLED(ip, event) ( \ - unlikely ((ip)->i_mount->m_flags & XFS_MOUNT_DMAPI) && \ - ( ((ip)->i_d.di_dmevmask & (1 << event)) || \ - ((ip)->i_mount->m_dmevmask & (1 << event)) ) \ - ) -#else -#define DM_EVENT_ENABLED(ip, event) (0) -#endif - -#define DM_XFS_VALID_FS_EVENTS ( \ - (1 << DM_EVENT_PREUNMOUNT) | \ - (1 << DM_EVENT_UNMOUNT) | \ - (1 << DM_EVENT_NOSPACE) | \ - (1 << DM_EVENT_DEBUT) | \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events valid in dm_set_eventlist() when called with a file handle for - a regular file or a symlink. These events are persistent. -*/ - -#define DM_XFS_VALID_FILE_EVENTS ( \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events valid in dm_set_eventlist() when called with a file handle for - a directory. These events are persistent. -*/ - -#define DM_XFS_VALID_DIRECTORY_EVENTS ( \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events supported by the XFS filesystem. */ -#define DM_XFS_SUPPORTED_EVENTS ( \ - (1 << DM_EVENT_MOUNT) | \ - (1 << DM_EVENT_PREUNMOUNT) | \ - (1 << DM_EVENT_UNMOUNT) | \ - (1 << DM_EVENT_NOSPACE) | \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_READ) | \ - (1 << DM_EVENT_WRITE) | \ - (1 << DM_EVENT_TRUNCATE) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - - -/* - * Definitions used for the flags field on dm_send_*_event(). - */ - -#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ -#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ -#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */ -#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ -#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ - -/* - * Pull in platform specific event flags defines - */ -#include "xfs_dmapi_priv.h" - -/* - * Macros to turn caller specified delay/block flags into - * dm_send_xxxx_event flag DM_FLAGS_NDELAY. - */ - -#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ - DM_FLAGS_NDELAY : 0) -#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) - -#endif /* __XFS_DMAPI_H__ */ diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c deleted file mode 100644 index e71e258..0000000 --- a/fs/xfs/xfs_dmops.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_dmapi.h" -#include "xfs_inum.h" -#include "xfs_ag.h" -#include "xfs_mount.h" - - -static struct xfs_dmops xfs_dmcore_stub = { - .xfs_send_data = (xfs_send_data_t)fs_nosys, - .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr, - .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys, - .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys, - .xfs_send_mount = (xfs_send_mount_t)fs_nosys, - .xfs_send_unmount = (xfs_send_unmount_t)fs_noerr, -}; - -int -xfs_dmops_get(struct xfs_mount *mp) -{ - if (mp->m_flags & XFS_MOUNT_DMAPI) { - cmn_err(CE_WARN, - "XFS: dmapi support not available in this kernel."); - return EINVAL; - } - - mp->m_dm_ops = &xfs_dmcore_stub; - return 0; -} - -void -xfs_dmops_put(struct xfs_mount *mp) -{ -} diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 047b8a8..ed99902 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -23,12 +23,8 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_utils.h" diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 409fe81..a55e687 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -24,7 +24,6 @@ #include "xfs_buf_item.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" @@ -33,18 +32,19 @@ kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; -STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} void -xfs_efi_item_free(xfs_efi_log_item_t *efip) +xfs_efi_item_free( + struct xfs_efi_log_item *efip) { - int nexts = efip->efi_format.efi_nextents; - - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); - } else { + else kmem_zone_free(xfs_efi_zone, efip); - } } /* @@ -52,9 +52,9 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip) * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. */ -/*ARGSUSED*/ STATIC uint -xfs_efi_item_size(xfs_efi_log_item_t *efip) +xfs_efi_item_size( + struct xfs_log_item *lip) { return 1; } @@ -67,10 +67,12 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip) * slots in the efi item have been filled. */ STATIC void -xfs_efi_item_format(xfs_efi_log_item_t *efip, - xfs_log_iovec_t *log_vector) +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) { - uint size; + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + uint size; ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); @@ -80,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip, size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); efip->efi_format.efi_size = 1; - log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); + log_vector->i_addr = &efip->efi_format; log_vector->i_len = size; log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; ASSERT(size >= sizeof(xfs_efi_log_format_t)); @@ -90,60 +92,33 @@ xfs_efi_item_format(xfs_efi_log_item_t *efip, /* * Pinning has no meaning for an efi item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_pin(xfs_efi_log_item_t *efip) +xfs_efi_item_pin( + struct xfs_log_item *lip) { - return; } - /* * While EFIs cannot really be pinned, the unpin operation is the * last place at which the EFI is manipulated during a transaction. * Here we coordinate with xfs_efi_cancel() to determine who gets to * free the EFI. */ -/*ARGSUSED*/ -STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip) -{ - struct xfs_ail *ailp = efip->efi_item.li_ailp; - - spin_lock(&ailp->xa_lock); - if (efip->efi_flags & XFS_EFI_CANCELED) { - /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); - xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&ailp->xa_lock); - } -} - -/* - * like unpin only we have to also clear the xaction descriptor - * pointing the log item if we free the item. This routine duplicates - * unpin because efi_flags is protected by the AIL lock. Freeing - * the descriptor and then calling unpin would force us to drop the AIL - * lock which would open up a race condition. - */ STATIC void -xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) { - struct xfs_ail *ailp = efip->efi_item.li_ailp; - xfs_log_item_desc_t *lidp; + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_ail *ailp = lip->li_ailp; spin_lock(&ailp->xa_lock); if (efip->efi_flags & XFS_EFI_CANCELED) { - /* - * free the xaction descriptor pointing to this item - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); - xfs_trans_free_item(tp, lidp); + if (remove) + xfs_trans_del_item(lip); /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); + xfs_trans_ail_delete(ailp, lip); xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; @@ -158,9 +133,9 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) * XFS_ITEM_PINNED so that the caller will eventually flush the log. * This should help in getting the EFI out of the AIL. */ -/*ARGSUSED*/ STATIC uint -xfs_efi_item_trylock(xfs_efi_log_item_t *efip) +xfs_efi_item_trylock( + struct xfs_log_item *lip) { return XFS_ITEM_PINNED; } @@ -168,13 +143,12 @@ xfs_efi_item_trylock(xfs_efi_log_item_t *efip) /* * Efi items have no locking, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_unlock(xfs_efi_log_item_t *efip) +xfs_efi_item_unlock( + struct xfs_log_item *lip) { - if (efip->efi_item.li_flags & XFS_LI_ABORTED) - xfs_efi_item_free(efip); - return; + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); } /* @@ -183,9 +157,10 @@ xfs_efi_item_unlock(xfs_efi_log_item_t *efip) * flag is not paid any attention here. Checking for that is delayed * until the EFI is unpinned. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { return lsn; } @@ -195,11 +170,10 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) * stuck waiting for all of its corresponding efd items to be * committed to disk. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_push(xfs_efi_log_item_t *efip) +xfs_efi_item_push( + struct xfs_log_item *lip) { - return; } /* @@ -209,61 +183,55 @@ xfs_efi_item_push(xfs_efi_log_item_t *efip) * example, for inodes, the inode is locked throughout the extent freeing * so the dependency should be recorded there. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return; } /* * This is the ops vector shared by all efi log items. */ static struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_efi_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) - xfs_efi_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efi_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efi_item_committing + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_trylock = xfs_efi_item_trylock, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing }; /* * Allocate and initialize an efi item with the given number of extents. */ -xfs_efi_log_item_t * -xfs_efi_init(xfs_mount_t *mp, - uint nextents) +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) { - xfs_efi_log_item_t *efip; + struct xfs_efi_log_item *efip; uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efi_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP); + efip = kmem_zalloc(size, KM_SLEEP); } else { - efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone, - KM_SLEEP); + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); } xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; - return (efip); + return efip; } /* @@ -276,7 +244,7 @@ xfs_efi_init(xfs_mount_t *mp, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { - xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr; + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; uint len = sizeof(xfs_efi_log_format_t) + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); @@ -289,8 +257,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); return 0; } else if (buf->i_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = - (xfs_efi_log_format_32_t *)buf->i_addr; + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -304,8 +271,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } else if (buf->i_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = - (xfs_efi_log_format_64_t *)buf->i_addr; + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; @@ -356,16 +322,18 @@ xfs_efi_release(xfs_efi_log_item_t *efip, } } -STATIC void -xfs_efd_item_free(xfs_efd_log_item_t *efdp) +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) { - int nexts = efdp->efd_format.efd_nextents; + return container_of(lip, struct xfs_efd_log_item, efd_item); +} - if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { +STATIC void +xfs_efd_item_free(struct xfs_efd_log_item *efdp) +{ + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); - } else { + else kmem_zone_free(xfs_efd_zone, efdp); - } } /* @@ -373,9 +341,9 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp) * We only need 1 iovec for an efd item. It just logs the efd_log_format * structure. */ -/*ARGSUSED*/ STATIC uint -xfs_efd_item_size(xfs_efd_log_item_t *efdp) +xfs_efd_item_size( + struct xfs_log_item *lip) { return 1; } @@ -388,10 +356,12 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp) * slots in the efd item have been filled. */ STATIC void -xfs_efd_item_format(xfs_efd_log_item_t *efdp, - xfs_log_iovec_t *log_vector) +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) { - uint size; + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + uint size; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); @@ -401,48 +371,38 @@ xfs_efd_item_format(xfs_efd_log_item_t *efdp, size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); efdp->efd_format.efd_size = 1; - log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); + log_vector->i_addr = &efdp->efd_format; log_vector->i_len = size; log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; ASSERT(size >= sizeof(xfs_efd_log_format_t)); } - /* * Pinning has no meaning for an efd item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_pin(xfs_efd_log_item_t *efdp) +xfs_efd_item_pin( + struct xfs_log_item *lip) { - return; } - /* * Since pinning has no meaning for an efd item, unpinning does * not either. */ -/*ARGSUSED*/ -STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp) -{ - return; -} - -/*ARGSUSED*/ STATIC void -xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp) +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) { - return; } /* * Efd items have no locking, so just return success. */ -/*ARGSUSED*/ STATIC uint -xfs_efd_item_trylock(xfs_efd_log_item_t *efdp) +xfs_efd_item_trylock( + struct xfs_log_item *lip) { return XFS_ITEM_LOCKED; } @@ -451,13 +411,12 @@ xfs_efd_item_trylock(xfs_efd_log_item_t *efdp) * Efd items have no locking or pushing, so return failure * so that the caller doesn't bother with us. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) +xfs_efd_item_unlock( + struct xfs_log_item *lip) { - if (efdp->efd_item.li_flags & XFS_LI_ABORTED) - xfs_efd_item_free(efdp); - return; + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); } /* @@ -467,15 +426,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) * return -1 to keep the transaction code from further referencing * this item. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + /* * If we got a log I/O error, it's always the case that the LR with the * EFI got unpinned and freed before the EFD got aborted. */ - if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) + if (!(lip->li_flags & XFS_LI_ABORTED)) xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); xfs_efd_item_free(efdp); @@ -486,11 +448,10 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) * There isn't much you can do to push on an efd item. It is simply * stuck waiting for the log to be flushed to disk. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_push(xfs_efd_log_item_t *efdp) +xfs_efd_item_push( + struct xfs_log_item *lip) { - return; } /* @@ -500,55 +461,48 @@ xfs_efd_item_push(xfs_efd_log_item_t *efdp) * example, for inodes, the inode is locked throughout the extent freeing * so the dependency should be recorded there. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return; } /* * This is the ops vector shared by all efd log items. */ static struct xfs_item_ops xfs_efd_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_efd_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_efd_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efd_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efd_item_committing + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_trylock = xfs_efd_item_trylock, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing }; - /* * Allocate and initialize an efd item with the given number of extents. */ -xfs_efd_log_item_t * -xfs_efd_init(xfs_mount_t *mp, - xfs_efi_log_item_t *efip, - uint nextents) +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) { - xfs_efd_log_item_t *efdp; + struct xfs_efd_log_item *efdp; uint size; ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efd_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP); + efdp = kmem_zalloc(size, KM_SLEEP); } else { - efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone, - KM_SLEEP); + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); } xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); @@ -556,5 +510,5 @@ xfs_efd_init(xfs_mount_t *mp, efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - return (efdp); + return efdp; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 390850e..9b715dc 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -18,13 +18,9 @@ #include "xfs.h" #include "xfs_bmap_btree.h" #include "xfs_inum.h" -#include "xfs_dir2.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -127,6 +123,82 @@ typedef struct fstrm_item xfs_inode_t *pip; /* Parent directory inode pointer. */ } fstrm_item_t; +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +static int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static void +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_put(pag); +} /* * Scan the AGs starting at startag looking for an AG that isn't in use and has @@ -355,16 +427,14 @@ xfs_fstrm_free_func( { fstrm_item_t *item = (fstrm_item_t *)data; xfs_inode_t *ip = item->ip; - int ref; ASSERT(ip->i_ino == ino); xfs_iflags_clear(ip, XFS_IFILESTREAM); /* Drop the reference taken on the AG when the item was added. */ - ref = xfs_filestream_put_ag(ip->i_mount, item->ag); + xfs_filestream_put_ag(ip->i_mount, item->ag); - ASSERT(ref >= 0); TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, xfs_filestream_peek_ag(ip->i_mount, item->ag)); diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 260f757..09dd9af 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -42,88 +42,6 @@ extern ktrace_t *xfs_filestreams_trace_buf; #endif -/* - * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. The mount - * point's m_peraglock is used to protect these counters from per-ag array - * re-allocation during a growfs operation. When xfs_growfs_data_private() is - * about to reallocate the array, it calls xfs_filestream_flush() with the - * m_peraglock held in write mode. - * - * Since xfs_mru_cache_flush() guarantees that all the free functions for all - * the cache elements have finished executing before it returns, it's safe for - * the free functions to use the atomic counters without m_peraglock protection. - * This allows the implementation of xfs_fstrm_free_func() to be agnostic about - * whether it was called with the m_peraglock held in read mode, write mode or - * not held at all. The race condition this addresses is the following: - * - * - The work queue scheduler fires and pulls a filestream directory cache - * element off the LRU end of the cache for deletion, then gets pre-empted. - * - A growfs operation grabs the m_peraglock in write mode, flushes all the - * remaining items from the cache and reallocates the mount point's per-ag - * array, resetting all the counters to zero. - * - The work queue thread resumes and calls the free function for the element - * it started cleaning up earlier. In the process it decrements the - * filestreams counter for an AG that now has no references. - * - * With a shrinkfs feature, the above scenario could panic the system. - * - * All other uses of the following macros should be protected by either the - * m_peraglock held in read mode, or the cache's internal locking exposed by the - * interval between a call to xfs_mru_cache_lookup() and a call to - * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode - * when new elements are added to the cache. - * - * Combined, these locking rules ensure that no associations will ever exist in - * the cache that reference per-ag array elements that have since been - * reallocated. - */ -/* - * xfs_filestream_peek_ag is only used in tracing code - */ -static inline int -xfs_filestream_peek_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_read(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static inline int -xfs_filestream_get_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_inc_return(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - -static inline int -xfs_filestream_put_ag( - xfs_mount_t *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ret; - - pag = xfs_perag_get(mp, agno); - ret = atomic_dec_return(&pag->pagf_fstrms); - xfs_perag_put(pag); - return ret; -} - /* allocation selection flags */ typedef enum xfs_fstrm_alloc { XFS_PICK_USERDATA = 1, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 37a6f62..dbca5f5 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -626,8 +622,7 @@ xfs_fs_log_dummy( ip = mp->m_rootip; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c7142a0..abf80ae 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c282a9a..d352862 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 8f8b91b..b1ecc6f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -25,14 +25,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -95,7 +91,7 @@ xfs_inode_alloc( return ip; } -STATIC void +void xfs_inode_free( struct xfs_inode *ip) { @@ -212,7 +208,7 @@ xfs_iget_cache_hit( ip->i_flags &= ~XFS_INEW; ip->i_flags |= XFS_IRECLAIMABLE; __xfs_inode_set_reclaim_tag(pag, ip); - trace_xfs_iget_reclaim(ip); + trace_xfs_iget_reclaim_fail(ip); goto out_error; } @@ -227,6 +223,7 @@ xfs_iget_cache_hit( } else { /* If the VFS inode is being torn down, pause and try again. */ if (!igrab(inode)) { + trace_xfs_iget_skip(ip); error = EAGAIN; goto out_error; } @@ -234,6 +231,7 @@ xfs_iget_cache_hit( /* We've got a live one. */ spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); + trace_xfs_iget_hit(ip); } if (lock_flags != 0) @@ -242,7 +240,6 @@ xfs_iget_cache_hit( xfs_iflags_clear(ip, XFS_ISTALE); XFS_STATS_INC(xs_ig_found); - trace_xfs_iget_found(ip); return 0; out_error: @@ -264,7 +261,6 @@ xfs_iget_cache_miss( { struct xfs_inode *ip; int error; - unsigned long first_index, mask; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); ip = xfs_inode_alloc(mp, ino); @@ -275,7 +271,7 @@ xfs_iget_cache_miss( if (error) goto out_destroy; - xfs_itrace_entry(ip); + trace_xfs_iget_miss(ip); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { error = ENOENT; @@ -301,8 +297,6 @@ xfs_iget_cache_miss( BUG(); } - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); - first_index = agino & mask; write_lock(&pag->pag_ici_lock); /* insert the new inode */ @@ -321,7 +315,6 @@ xfs_iget_cache_miss( write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); - trace_xfs_iget_alloc(ip); *ipp = ip; return 0; @@ -422,97 +415,6 @@ out_error_or_again: } /* - * Decrement reference count of an inode structure and unlock it. - * - * ip -- the inode being released - * lock_flags -- this parameter indicates the inode's locks to be - * to be released. See the comment on xfs_iunlock() for a list - * of valid values. - */ -void -xfs_iput(xfs_inode_t *ip, - uint lock_flags) -{ - xfs_itrace_entry(ip); - xfs_iunlock(ip, lock_flags); - IRELE(ip); -} - -/* - * Special iput for brand-new inodes that are still locked - */ -void -xfs_iput_new( - xfs_inode_t *ip, - uint lock_flags) -{ - struct inode *inode = VFS_I(ip); - - xfs_itrace_entry(ip); - - if ((ip->i_d.di_mode == 0)) { - ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - make_bad_inode(inode); - } - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - if (lock_flags) - xfs_iunlock(ip, lock_flags); - IRELE(ip); -} - -/* - * This is called free all the memory associated with an inode. - * It must free the inode itself and any buffers allocated for - * if_extents/if_data and if_broot. It must also free the lock - * associated with the inode. - * - * Note: because we don't initialise everything on reallocation out - * of the zone, we must ensure we nullify everything correctly before - * freeing the structure. - */ -void -xfs_ireclaim( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - - XFS_STATS_INC(xs_ig_reclaims); - - /* - * Remove the inode from the per-AG radix tree. - * - * Because radix_tree_delete won't complain even if the item was never - * added to the tree assert that it's been there before to catch - * problems with the inode life time early on. - */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - write_lock(&pag->pag_ici_lock); - if (!radix_tree_delete(&pag->pag_ici_root, agino)) - ASSERT(0); - write_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); - - /* - * Here we do an (almost) spurious inode lock in order to coordinate - * with inode cache radix tree lookups. This is because the lookup - * can reference the inodes in the cache without taking references. - * - * We make that OK here by ensuring that we wait until the inode is - * unlocked after the lookup before we go ahead and free it. We get - * both the ilock and the iolock because the code may need to drop the - * ilock one but will still hold the iolock. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_qm_dqdetach(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - xfs_inode_free(ip); -} - -/* * This is a wrapper routine around the xfs_ilock() routine * used to centralize some grungy code. It is used in places * that wish to lock the inode solely for reading the extents. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b76a829..68415cb 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -27,13 +27,10 @@ #include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -44,7 +41,6 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_utils.h" #include "xfs_quota.h" @@ -426,7 +422,7 @@ xfs_iformat( if (!XFS_DFORK_Q(dip)) return 0; ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); switch (dip->di_aformat) { @@ -509,7 +505,7 @@ xfs_iformat_local( ifp->if_u1.if_data = ifp->if_u2.if_inline_data; else { real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); } ifp->if_bytes = size; ifp->if_real_bytes = real_size; @@ -636,7 +632,7 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP); + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -922,7 +918,6 @@ xfs_iread_extents( int error; xfs_ifork_t *ifp; xfs_extnum_t nextents; - size_t size; if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, @@ -930,7 +925,6 @@ xfs_iread_extents( return XFS_ERROR(EFSCORRUPTED); } nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - size = nextents * sizeof(xfs_bmbt_rec_t); ifp = XFS_IFORK_PTR(ip, whichfork); /* @@ -1226,7 +1220,7 @@ xfs_isize_check( (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first), XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL, NULL)) + NULL)) return; ASSERT(nimaps == 1); ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); @@ -1460,7 +1454,7 @@ xfs_itruncate_finish( ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_transp == *tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); + ASSERT(ip->i_itemp->ili_lock_flags == 0); ntp = *tp; @@ -1589,11 +1583,10 @@ xfs_itruncate_finish( xfs_bmap_init(&free_list, &first_block); error = xfs_bunmapi(ntp, ip, first_unmap_block, unmap_len, - xfs_bmapi_aflag(fork) | - (sync ? 0 : XFS_BMAPI_ASYNC), + xfs_bmapi_aflag(fork), XFS_ITRUNC_MAX_EXTENTS, &first_block, &free_list, - NULL, &done); + &done); if (error) { /* * If the bunmapi call encounters an error, @@ -1612,12 +1605,8 @@ xfs_itruncate_finish( */ error = xfs_bmap_finish(tp, &free_list, &committed); ntp = *tp; - if (committed) { - /* link the inode into the next xact in the chain */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); - } + if (committed) + xfs_trans_ijoin(ntp, ip); if (error) { /* @@ -1646,9 +1635,7 @@ xfs_itruncate_finish( error = xfs_trans_commit(*tp, 0); *tp = ntp; - /* link the inode into the next transaction in the chain */ - xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); + xfs_trans_ijoin(ntp, ip); if (error) return error; @@ -1985,7 +1972,7 @@ xfs_ifree_cluster( if (lip->li_type == XFS_LI_INODE) { iip = (xfs_inode_log_item_t *)lip; ASSERT(iip->ili_logged == 1); - lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; + lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -2055,9 +2042,8 @@ xfs_ifree_cluster( xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); - xfs_buf_attach_iodone(bp, - (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_istale_done, (xfs_log_item_t *)iip); + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -2203,7 +2189,7 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_SLEEP); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); ifp->if_broot_bytes = (int)new_size; return; } @@ -2219,7 +2205,7 @@ xfs_iroot_realloc( new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ - KM_SLEEP); + KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -2245,7 +2231,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_SLEEP); + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); /* * First copy over the btree block header. */ @@ -2349,7 +2335,8 @@ xfs_idata_realloc( real_size = roundup(new_size, 4); if (ifp->if_u1.if_data == NULL) { ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { /* * Only do the realloc if the underlying size @@ -2360,11 +2347,12 @@ xfs_idata_realloc( kmem_realloc(ifp->if_u1.if_data, real_size, ifp->if_real_bytes, - KM_SLEEP); + KM_SLEEP | KM_NOFS); } } else { ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, ifp->if_bytes); } @@ -2731,7 +2719,6 @@ cluster_corrupt_out: * mark it as stale and brelse. */ if (XFS_BUF_IODONE_FUNC(bp)) { - XFS_BUF_CLR_BDSTRAT_FUNC(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); XFS_BUF_ERROR(bp,EIO); @@ -3069,8 +3056,7 @@ xfs_iflush_int( * and unlock the inode's flush lock when the inode is * completely written to disk. */ - xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_iflush_done, (xfs_log_item_t *)iip); + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); @@ -3514,13 +3500,11 @@ xfs_iext_remove_indirect( xfs_extnum_t ext_diff; /* extents to remove in current list */ xfs_extnum_t nex1; /* number of extents before idx */ xfs_extnum_t nex2; /* extents after idx + count */ - int nlists; /* entries in indirection array */ int page_idx = idx; /* index in target extent list */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); ASSERT(erp != NULL); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; nex1 = page_idx; ext_cnt = count; while (ext_cnt) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 78550df..0898c54 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -443,8 +443,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) */ int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, uint, uint, xfs_inode_t **); -void xfs_iput(xfs_inode_t *, uint); -void xfs_iput_new(xfs_inode_t *, uint); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); @@ -452,7 +450,7 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_ireclaim(xfs_inode_t *); +void xfs_inode_free(struct xfs_inode *ip); /* * xfs_inode.c prototypes. diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cf8249a..fe00777 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -22,30 +22,26 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" -#include "xfs_buf_item.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_trace.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + + /* * This returns the number of iovecs needed to log the given inode item. * @@ -55,13 +51,11 @@ kmem_zone_t *xfs_ili_zone; /* inode log item zone */ */ STATIC uint xfs_inode_item_size( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - uint nvecs; - xfs_inode_t *ip; - - ip = iip->ili_inode; - nvecs = 2; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + uint nvecs = 2; /* * Only log the data/extents/b-tree root if there is something @@ -212,21 +206,17 @@ xfs_inode_item_size( */ STATIC void xfs_inode_item_format( - xfs_inode_log_item_t *iip, - xfs_log_iovec_t *log_vector) + struct xfs_log_item *lip, + struct xfs_log_iovec *vecp) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; uint nvecs; - xfs_log_iovec_t *vecp; - xfs_inode_t *ip; size_t data_bytes; xfs_bmbt_rec_t *ext_buffer; - int nrecs; xfs_mount_t *mp; - ip = iip->ili_inode; - vecp = log_vector; - - vecp->i_addr = (xfs_caddr_t)&iip->ili_format; + vecp->i_addr = &iip->ili_format; vecp->i_len = sizeof(xfs_inode_log_format_t); vecp->i_type = XLOG_REG_TYPE_IFORMAT; vecp++; @@ -277,7 +267,7 @@ xfs_inode_item_format( */ xfs_synchronize_times(ip); - vecp->i_addr = (xfs_caddr_t)&ip->i_d; + vecp->i_addr = &ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); vecp->i_type = XLOG_REG_TYPE_ICORE; vecp++; @@ -323,18 +313,17 @@ xfs_inode_item_format( ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_d.di_nextents > 0); ASSERT(iip->ili_extents_buf == NULL); - nrecs = ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nrecs > 0); + ASSERT((ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) > 0); #ifdef XFS_NATIVE_HOST - if (nrecs == ip->i_d.di_nextents) { + if (ip->i_d.di_nextents == ip->i_df.if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)) { /* * There are no delayed allocation * extents, so just point to the * real extents array. */ - vecp->i_addr = - (char *)(ip->i_df.if_u1.if_extents); + vecp->i_addr = ip->i_df.if_u1.if_extents; vecp->i_len = ip->i_df.if_bytes; vecp->i_type = XLOG_REG_TYPE_IEXT; } else @@ -352,7 +341,7 @@ xfs_inode_item_format( ext_buffer = kmem_alloc(ip->i_df.if_bytes, KM_SLEEP); iip->ili_extents_buf = ext_buffer; - vecp->i_addr = (xfs_caddr_t)ext_buffer; + vecp->i_addr = ext_buffer; vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_DATA_FORK); vecp->i_type = XLOG_REG_TYPE_IEXT; @@ -371,7 +360,7 @@ xfs_inode_item_format( if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { ASSERT(ip->i_df.if_broot_bytes > 0); ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; + vecp->i_addr = ip->i_df.if_broot; vecp->i_len = ip->i_df.if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IBROOT; vecp++; @@ -389,7 +378,7 @@ xfs_inode_item_format( ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); - vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; + vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to @@ -437,7 +426,7 @@ xfs_inode_item_format( * Assert that no attribute-related log flags are set. */ if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == iip->ili_item.li_desc->lid_size); + ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); @@ -449,21 +438,21 @@ xfs_inode_item_format( ASSERT(!(iip->ili_format.ilf_fields & (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { - ASSERT(ip->i_afp->if_bytes > 0); - ASSERT(ip->i_afp->if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_anextents > 0); #ifdef DEBUG - nrecs = ip->i_afp->if_bytes / + int nrecs = ip->i_afp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); -#endif ASSERT(nrecs > 0); ASSERT(nrecs == ip->i_d.di_anextents); + ASSERT(ip->i_afp->if_bytes > 0); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + ASSERT(ip->i_d.di_anextents > 0); +#endif #ifdef XFS_NATIVE_HOST /* * There are not delayed allocation extents * for attributes, so just point at the array. */ - vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); + vecp->i_addr = ip->i_afp->if_u1.if_extents; vecp->i_len = ip->i_afp->if_bytes; #else ASSERT(iip->ili_aextents_buf == NULL); @@ -473,7 +462,7 @@ xfs_inode_item_format( ext_buffer = kmem_alloc(ip->i_afp->if_bytes, KM_SLEEP); iip->ili_aextents_buf = ext_buffer; - vecp->i_addr = (xfs_caddr_t)ext_buffer; + vecp->i_addr = ext_buffer; vecp->i_len = xfs_iextents_copy(ip, ext_buffer, XFS_ATTR_FORK); #endif @@ -490,7 +479,7 @@ xfs_inode_item_format( if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { ASSERT(ip->i_afp->if_broot_bytes > 0); ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; + vecp->i_addr = ip->i_afp->if_broot; vecp->i_len = ip->i_afp->if_broot_bytes; vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; vecp++; @@ -506,7 +495,7 @@ xfs_inode_item_format( ASSERT(ip->i_afp->if_bytes > 0); ASSERT(ip->i_afp->if_u1.if_data != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; + vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to @@ -528,7 +517,7 @@ xfs_inode_item_format( break; } - ASSERT(nvecs == iip->ili_item.li_desc->lid_size); + ASSERT(nvecs == lip->li_desc->lid_size); iip->ili_format.ilf_size = nvecs; } @@ -539,12 +528,14 @@ xfs_inode_item_format( */ STATIC void xfs_inode_item_pin( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - trace_xfs_inode_pin(iip->ili_inode, _RET_IP_); - atomic_inc(&iip->ili_inode->i_pincount); + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); } @@ -554,12 +545,12 @@ xfs_inode_item_pin( * * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. */ -/* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip, + int remove) { - struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); @@ -567,15 +558,6 @@ xfs_inode_item_unpin( wake_up(&ip->i_ipin_wait); } -/* ARGSUSED */ -STATIC void -xfs_inode_item_unpin_remove( - xfs_inode_log_item_t *iip, - xfs_trans_t *tp) -{ - xfs_inode_item_unpin(iip); -} - /* * This is called to attempt to lock the inode associated with this * inode log item, in preparation for the push routine which does the actual @@ -591,19 +573,16 @@ xfs_inode_item_unpin_remove( */ STATIC uint xfs_inode_item_trylock( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - register xfs_inode_t *ip; - - ip = iip->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; - if (xfs_ipincount(ip) > 0) { + if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; - } - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; - } if (!xfs_iflock_nowait(ip)) { /* @@ -629,7 +608,7 @@ xfs_inode_item_trylock( if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { ASSERT(iip->ili_format.ilf_fields != 0); ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); + ASSERT(lip->li_flags & XFS_LI_IN_AIL); } #endif return XFS_ITEM_SUCCESS; @@ -643,26 +622,18 @@ xfs_inode_item_trylock( */ STATIC void xfs_inode_item_unlock( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - uint hold; - uint iolocked; - uint lock_flags; - xfs_inode_t *ip; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; - ASSERT(iip != NULL); ASSERT(iip->ili_inode->i_itemp != NULL); ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); - ASSERT((!(iip->ili_inode->i_itemp->ili_flags & - XFS_ILI_IOLOCKED_EXCL)) || - xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL)); - ASSERT((!(iip->ili_inode->i_itemp->ili_flags & - XFS_ILI_IOLOCKED_SHARED)) || - xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED)); + /* * Clear the transaction pointer in the inode. */ - ip = iip->ili_inode; ip->i_transp = NULL; /* @@ -686,34 +657,11 @@ xfs_inode_item_unlock( iip->ili_aextents_buf = NULL; } - /* - * Figure out if we should unlock the inode or not. - */ - hold = iip->ili_flags & XFS_ILI_HOLD; - - /* - * Before clearing out the flags, remember whether we - * are holding the inode's IO lock. - */ - iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY; - - /* - * Clear out the fields of the inode log item particular - * to the current transaction. - */ - iip->ili_flags = 0; - - /* - * Unlock the inode if XFS_ILI_HOLD was not set. - */ - if (!hold) { - lock_flags = XFS_ILOCK_EXCL; - if (iolocked & XFS_ILI_IOLOCKED_EXCL) { - lock_flags |= XFS_IOLOCK_EXCL; - } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) { - lock_flags |= XFS_IOLOCK_SHARED; - } - xfs_iput(iip->ili_inode, lock_flags); + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) { + xfs_iunlock(iip->ili_inode, lock_flags); + IRELE(iip->ili_inode); } } @@ -725,13 +673,12 @@ xfs_inode_item_unlock( * is the only one that matters. Therefore, simply return the * given lsn. */ -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_inode_item_committed( - xfs_inode_log_item_t *iip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - return (lsn); + return lsn; } /* @@ -743,13 +690,12 @@ xfs_inode_item_committed( */ STATIC void xfs_inode_item_pushbuf( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - xfs_inode_t *ip; - xfs_mount_t *mp; - xfs_buf_t *bp; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp; - ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); /* @@ -757,14 +703,13 @@ xfs_inode_item_pushbuf( * inode was taken off the AIL. So, just get out. */ if (completion_done(&ip->i_flush) || - ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { + !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return; } - mp = ip->i_mount; - bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XBF_TRYLOCK); + bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, + iip->ili_format.ilf_len, XBF_TRYLOCK); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!bp) @@ -772,10 +717,8 @@ xfs_inode_item_pushbuf( if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); - return; } - /* * This is called to asynchronously write the inode associated with this * inode log item out to disk. The inode will already have been locked by @@ -783,14 +726,14 @@ xfs_inode_item_pushbuf( */ STATIC void xfs_inode_item_push( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - xfs_inode_t *ip; - - ip = iip->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); ASSERT(!completion_done(&ip->i_flush)); + /* * Since we were able to lock the inode's flush lock and * we found it on the AIL, the inode must be dirty. This @@ -813,43 +756,34 @@ xfs_inode_item_push( */ (void) xfs_iflush(ip, 0); xfs_iunlock(ip, XFS_ILOCK_SHARED); - - return; } /* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. */ -/* ARGSUSED */ STATIC void xfs_inode_item_committing( - xfs_inode_log_item_t *iip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - iip->ili_last_lsn = lsn; - return; + INODE_ITEM(lip)->ili_last_lsn = lsn; } /* * This is the ops vector shared by all buf log items. */ static struct xfs_item_ops xfs_inode_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_inode_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_inode_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_inode_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, - .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_inode_item_committing + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_trylock = xfs_inode_item_trylock, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_pushbuf = xfs_inode_item_pushbuf, + .iop_committing = xfs_inode_item_committing }; @@ -858,10 +792,10 @@ static struct xfs_item_ops xfs_inode_item_ops = { */ void xfs_inode_item_init( - xfs_inode_t *ip, - xfs_mount_t *mp) + struct xfs_inode *ip, + struct xfs_mount *mp) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); @@ -899,14 +833,14 @@ xfs_inode_item_destroy( * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. */ -/*ARGSUSED*/ void xfs_iflush_done( - xfs_buf_t *bp, - xfs_inode_log_item_t *iip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); xfs_inode_t *ip = iip->ili_inode; - struct xfs_ail *ailp = iip->ili_item.li_ailp; + struct xfs_ail *ailp = lip->li_ailp; /* * We only want to pull the item from the AIL if it is @@ -917,12 +851,11 @@ xfs_iflush_done( * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL. */ - if (iip->ili_logged && - (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { spin_lock(&ailp->xa_lock); - if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { + if (lip->li_lsn == iip->ili_flush_lsn) { /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip); + xfs_trans_ail_delete(ailp, lip); } else { spin_unlock(&ailp->xa_lock); } @@ -940,8 +873,6 @@ xfs_iflush_done( * Release the inode's flush lock since we're done with it. */ xfs_ifunlock(ip); - - return; } /* @@ -957,10 +888,8 @@ xfs_iflush_abort( xfs_inode_t *ip) { xfs_inode_log_item_t *iip = ip->i_itemp; - xfs_mount_t *mp; iip = ip->i_itemp; - mp = ip->i_mount; if (iip) { struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { @@ -991,10 +920,10 @@ xfs_iflush_abort( void xfs_istale_done( - xfs_buf_t *bp, - xfs_inode_log_item_t *iip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - xfs_iflush_abort(iip->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); } /* @@ -1007,9 +936,8 @@ xfs_inode_item_format_convert( xfs_inode_log_format_t *in_f) { if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32; + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; - in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr; in_f->ilf_type = in_f32->ilf_type; in_f->ilf_size = in_f32->ilf_size; in_f->ilf_fields = in_f32->ilf_fields; @@ -1025,9 +953,8 @@ xfs_inode_item_format_convert( in_f->ilf_boffset = in_f32->ilf_boffset; return 0; } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64; + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; - in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr; in_f->ilf_type = in_f64->ilf_type; in_f->ilf_size = in_f64->ilf_size; in_f->ilf_fields = in_f64->ilf_fields; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 9a46795..d3dee61e 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -103,12 +103,6 @@ typedef struct xfs_inode_log_format_64 { XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ XFS_ILOG_ABROOT) -#define XFS_ILI_HOLD 0x1 -#define XFS_ILI_IOLOCKED_EXCL 0x2 -#define XFS_ILI_IOLOCKED_SHARED 0x4 - -#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) - static inline int xfs_ilog_fbroot(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); @@ -137,7 +131,7 @@ typedef struct xfs_inode_log_item { struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_flags; /* misc flags */ + unsigned short ili_lock_flags; /* lock flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ struct xfs_bmbt_rec *ili_extents_buf; /* array of logged @@ -161,8 +155,8 @@ static inline int xfs_inode_clean(xfs_inode_t *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); -extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *); +extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ef14943..2057614 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -23,19 +23,14 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" #include "xfs_alloc.h" -#include "xfs_dmapi.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -123,7 +118,7 @@ xfs_iomap( error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), bmapi_flags, NULL, 0, imap, - nimaps, NULL, NULL); + nimaps, NULL); if (error) goto out; @@ -138,7 +133,7 @@ xfs_iomap( break; } - if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { + if (flags & BMAPI_DIRECT) { error = xfs_iomap_write_direct(ip, offset, count, flags, imap, nimaps); } else { @@ -247,7 +242,7 @@ xfs_iomap_write_direct( xfs_off_t offset, size_t count, int flags, - xfs_bmbt_irec_t *ret_imap, + xfs_bmbt_irec_t *imap, int *nmaps) { xfs_mount_t *mp = ip->i_mount; @@ -261,7 +256,6 @@ xfs_iomap_write_direct( int quota_flag; int rt; xfs_trans_t *tp; - xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; int committed; @@ -285,10 +279,10 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) - ret_imap->br_blockcount + - ret_imap->br_startoff); + imap->br_blockcount + + imap->br_startoff); } count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); @@ -334,20 +328,22 @@ xfs_iomap_write_direct( if (error) goto error1; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); bmapi_flag = XFS_BMAPI_WRITE; if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the xfs_bmapi() call to allocate the blocks + * Issue the xfs_bmapi() call to allocate the blocks. + * + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, &imap, &nimaps, &free_list, NULL); + &firstfsb, 0, imap, &nimaps, &free_list); if (error) goto error0; @@ -369,12 +365,11 @@ xfs_iomap_write_direct( goto error_out; } - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) { - error = xfs_cmn_err_fsblock_zero(ip, &imap); + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + error = xfs_cmn_err_fsblock_zero(ip, imap); goto error_out; } - *ret_imap = imap; *nmaps = 1; return 0; @@ -425,7 +420,7 @@ xfs_iomap_eof_want_preallocate( imaps = nimaps; firstblock = NULLFSBLOCK; error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL, NULL); + &firstblock, 0, imap, &imaps, NULL); if (error) return error; for (n = 0; n < imaps; n++) { @@ -500,7 +495,7 @@ retry: (xfs_filblks_t)(last_fsb - offset_fsb), XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL, NULL); + &nimaps, NULL); if (error && (error != ENOSPC)) return XFS_ERROR(error); @@ -548,7 +543,7 @@ xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, size_t count, - xfs_bmbt_irec_t *map, + xfs_bmbt_irec_t *imap, int *retmap) { xfs_mount_t *mp = ip->i_mount; @@ -557,7 +552,6 @@ xfs_iomap_write_allocate( xfs_fsblock_t first_block; xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; - xfs_bmbt_irec_t imap; xfs_trans_t *tp; int nimaps, committed; int error = 0; @@ -573,8 +567,8 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = map->br_blockcount; - map_start_fsb = map->br_startoff; + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); @@ -602,8 +596,7 @@ xfs_iomap_write_allocate( return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_bmap_init(&free_list, &first_block); @@ -654,10 +647,15 @@ xfs_iomap_write_allocate( } } - /* Go get the actual blocks */ + /* + * Go get the actual blocks. + * + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb, XFS_BMAPI_WRITE, &first_block, 1, - &imap, &nimaps, &free_list, NULL); + imap, &nimaps, &free_list); if (error) goto trans_cancel; @@ -676,13 +674,12 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_cmn_err_fsblock_zero(ip, &imap); + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_cmn_err_fsblock_zero(ip, imap); - if ((offset_fsb >= imap.br_startoff) && - (offset_fsb < (imap.br_startoff + - imap.br_blockcount))) { - *map = imap; + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { *retmap = 1; XFS_STATS_INC(xs_xstrat_quick); return 0; @@ -692,8 +689,8 @@ xfs_iomap_write_allocate( * So far we have not mapped the requested part of the * file, just surrounding data, try again. */ - count_fsb -= imap.br_blockcount; - map_start_fsb = imap.br_startoff + imap.br_blockcount; + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; } trans_cancel: @@ -766,8 +763,7 @@ xfs_iomap_write_unwritten( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Modify the unwritten extent state of the buffer. @@ -776,7 +772,7 @@ xfs_iomap_write_unwritten( nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, - 1, &imap, &nimaps, &free_list, NULL); + 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 81ac4af..7748a43 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,17 +18,16 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -typedef enum { - /* base extent manipulation calls */ - BMAPI_READ = (1 << 0), /* read extents */ - BMAPI_WRITE = (1 << 1), /* create extents */ - BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */ - /* modifiers */ - BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ - BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ - BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ - BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ -} bmapi_flags_t; +/* base extent manipulation calls */ +#define BMAPI_READ (1 << 0) /* read extents */ +#define BMAPI_WRITE (1 << 1) /* create extents */ +#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */ + +/* modifiers */ +#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */ +#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */ +#define BMAPI_MMA (1 << 6) /* allocate for mmap write */ +#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */ #define BMAPI_FLAGS \ { BMAPI_READ, "READ" }, \ @@ -36,7 +35,6 @@ typedef enum { { BMAPI_ALLOCATE, "ALLOCATE" }, \ { BMAPI_IGNSTATE, "IGNSTATE" }, \ { BMAPI_DIRECT, "DIRECT" }, \ - { BMAPI_MMAP, "MMAP" }, \ { BMAPI_TRYLOCK, "TRYLOCK" } struct xfs_inode; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 2b86f86..7e3626e 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -24,20 +24,17 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_btree.h" +#include "xfs_trace.h" STATIC int xfs_internal_inum( @@ -143,7 +140,8 @@ xfs_bulkstat_one_int( buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; break; } - xfs_iput(ip, XFS_ILOCK_SHARED); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); error = formatter(buffer, ubsize, ubused, buf); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5215abc..925d572 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,8 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_log_priv.h" @@ -35,8 +33,6 @@ #include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" #include "xfs_trans_priv.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_rw.h" @@ -337,7 +333,6 @@ xfs_log_reserve( int retval = 0; ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - ASSERT((flags & XFS_LOG_NOSLEEP) == 0); if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); @@ -552,7 +547,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = (void *)&magic, + .i_addr = &magic, .i_len = sizeof(magic), .i_type = XLOG_REG_TYPE_UNMOUNT, }; @@ -1047,7 +1042,6 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_in_core_t *iclog, *prev_iclog=NULL; xfs_buf_t *bp; int i; - int iclogsize; int error = ENOMEM; uint log2_size = 0; @@ -1127,7 +1121,6 @@ xlog_alloc_log(xfs_mount_t *mp, * with different amounts of memory. See the definition of * xlog_in_core_t in xfs_log_priv.h for details. */ - iclogsize = log->l_iclog_size; ASSERT(log->l_iclog_size >= 4096); for (i=0; i < log->l_iclog_bufs; i++) { *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); @@ -1428,11 +1421,8 @@ xlog_sync(xlog_t *log, XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_LOG_BUFFER; - /* - * Do an ordered write for the log block. - * Its unnecessary to flush the first split block in the log wrap case. - */ - if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) XFS_BUF_ORDERED(bp); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 04c78e6..916eb7d 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -55,14 +55,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) /* * Flags to xfs_log_reserve() * - * XFS_LOG_SLEEP: If space is not available, sleep (default) - * XFS_LOG_NOSLEEP: If space is not available, return error * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are * performed against this type of reservation, the reservation * is not decreased. Long running transactions should use this. */ -#define XFS_LOG_SLEEP 0x0 -#define XFS_LOG_NOSLEEP 0x1 #define XFS_LOG_PERM_RESERV 0x2 /* @@ -104,7 +100,7 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_MAX 19 typedef struct xfs_log_iovec { - xfs_caddr_t i_addr; /* beginning address of region */ + void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ } xfs_log_iovec_t; @@ -201,9 +197,4 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); #endif - - -extern int xlog_debug; /* set to 1 to enable real log */ - - #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index bb17cc0..31e4ea2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -26,8 +26,6 @@ #include "xfs_log_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" @@ -554,7 +552,7 @@ xlog_cil_push( thdr.th_type = XFS_TRANS_CHECKPOINT; thdr.th_tid = tic->t_tid; thdr.th_num_items = num_iovecs; - lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_addr = &thdr; lhdr.i_len = sizeof(xfs_trans_header_t); lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9ac5cfa..6f3f5fa 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -24,15 +24,11 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -1565,9 +1561,7 @@ xlog_recover_reorder_trans( list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { - xfs_buf_log_format_t *buf_f; - - buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { case XFS_LI_BUF: @@ -1892,9 +1886,8 @@ xlog_recover_do_inode_buffer( * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = (xfs_agino_t *) - ((char *)(item->ri_buf[item_index].i_addr) + - (next_unlinked_offset - reg_buf_offset)); + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { xfs_fs_cmn_err(CE_ALERT, mp, "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", @@ -1973,8 +1966,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck((xfs_disk_dquot_t *) - item->ri_buf[i].i_addr, + error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -2187,7 +2179,7 @@ xlog_recover_do_buffer_trans( xlog_recover_item_t *item, int pass) { - xfs_buf_log_format_t *buf_f; + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp; xfs_buf_t *bp; int error; @@ -2197,8 +2189,6 @@ xlog_recover_do_buffer_trans( ushort flags; uint buf_flags; - buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; - if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items @@ -2319,10 +2309,9 @@ xlog_recover_do_inode_trans( } if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { - in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; + in_f = item->ri_buf[0].i_addr; } else { - in_f = (xfs_inode_log_format_t *)kmem_alloc( - sizeof(xfs_inode_log_format_t), KM_SLEEP); + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) @@ -2370,7 +2359,7 @@ xlog_recover_do_inode_trans( error = EFSCORRUPTED; goto error; } - dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); + dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, @@ -2461,7 +2450,7 @@ xlog_recover_do_inode_trans( } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); /* the rest is in on-disk format */ if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { @@ -2578,7 +2567,7 @@ xlog_recover_do_quotaoff_trans( return (0); } - qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; + qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2622,9 +2611,8 @@ xlog_recover_do_dquot_trans( if (mp->m_qflags == 0) return (0); - recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; - - if (item->ri_buf[1].i_addr == NULL) { + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); return XFS_ERROR(EIO); @@ -2654,7 +2642,7 @@ xlog_recover_do_dquot_trans( * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. */ - dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); if ((error = xfs_qm_dqcheck(recddq, dq_f->qlf_id, @@ -2721,7 +2709,7 @@ xlog_recover_do_efi_trans( return 0; } - efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].i_addr; mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); @@ -2767,7 +2755,7 @@ xlog_recover_do_efd_trans( return; } - efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 69f62d8..aeb9d72 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -25,13 +25,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d2c7ee..622da21 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -66,65 +66,6 @@ struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; - -/* - * Prototypes and functions for the Data Migration subsystem. - */ - -typedef int (*xfs_send_data_t)(int, struct xfs_inode *, - xfs_off_t, size_t, int, int *); -typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); -typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t); -typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, - struct xfs_inode *, dm_right_t, - struct xfs_inode *, dm_right_t, - const unsigned char *, const unsigned char *, - mode_t, int, int); -typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, - char *, char *); -typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *, - dm_right_t, mode_t, int, int); - -typedef struct xfs_dmops { - xfs_send_data_t xfs_send_data; - xfs_send_mmap_t xfs_send_mmap; - xfs_send_destroy_t xfs_send_destroy; - xfs_send_namesp_t xfs_send_namesp; - xfs_send_mount_t xfs_send_mount; - xfs_send_unmount_t xfs_send_unmount; -} xfs_dmops_t; - -#define XFS_DMAPI_UNMOUNT_FLAGS(mp) \ - (((mp)->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? 0 : DM_FLAGS_UNWANTED) - -#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \ - (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock) -#define XFS_SEND_MMAP(mp, vma,fl) \ - (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) -#define XFS_SEND_DESTROY(mp, ip,right) \ - (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right) -#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_MOUNT(mp,right,path,name) \ - (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) -#define XFS_SEND_PREUNMOUNT(mp) \ -do { \ - if (mp->m_flags & XFS_MOUNT_DMAPI) { \ - (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT, mp, \ - (mp)->m_rootip, DM_RIGHT_NULL, \ - (mp)->m_rootip, DM_RIGHT_NULL, \ - NULL, NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ - } \ -} while (0) -#define XFS_SEND_UNMOUNT(mp) \ -do { \ - if (mp->m_flags & XFS_MOUNT_DMAPI) { \ - (*(mp)->m_dm_ops->xfs_send_unmount)(mp, (mp)->m_rootip, \ - DM_RIGHT_NULL, 0, 0, XFS_DMAPI_UNMOUNT_FLAGS(mp)); \ - } \ -} while (0) - - #ifdef HAVE_PERCPU_SB /* @@ -241,8 +182,6 @@ typedef struct xfs_mount { uint m_chsize; /* size of next field */ struct xfs_chash *m_chash; /* fs private inode per-cluster * hash table */ - struct xfs_dmops *m_dm_ops; /* vector of DMI ops */ - struct xfs_qmops *m_qm_ops; /* vector of XQM ops */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ @@ -259,7 +198,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct list_head m_mplist; /* inode shrinker mount list */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ } xfs_mount_t; /* @@ -269,7 +208,6 @@ typedef struct xfs_mount { must be synchronous except for space allocations */ #define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ -#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -282,8 +220,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ -#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ - /* osyncisdsync is now default*/ #define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above * 32 bits in size */ #define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ @@ -440,11 +376,6 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); -extern int xfs_dmops_get(struct xfs_mount *); -extern void xfs_dmops_put(struct xfs_mount *); - -extern struct xfs_dmops xfs_dmcore_xfs; - #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index fc1cda2..8fca957 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -24,12 +24,9 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -116,20 +113,7 @@ xfs_rename( int spaceres; int num_inodes; - xfs_itrace_entry(src_dp); - xfs_itrace_entry(target_dp); - - if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || - DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, - src_dp, DM_RIGHT_NULL, - target_dp, DM_RIGHT_NULL, - src_name->name, target_name->name, - 0, 0, 0); - if (error) - return error; - } - /* Return through std_return after this point. */ + trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); @@ -184,26 +168,14 @@ xfs_rename( /* * Join all the inodes to the transaction. From this point on, * we can rely on either trans_commit or trans_cancel to unlock - * them. Note that we need to add a vnode reference to the - * directories since trans_commit & trans_cancel will decrement - * them when they unlock the inodes. Also, we need to be careful - * not to add an inode to the transaction more than once. + * them. */ - IHOLD(src_dp); - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); - - if (new_parent) { - IHOLD(target_dp); - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - } - - IHOLD(src_ip); - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - - if (target_ip) { - IHOLD(target_ip); - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); - } + xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -369,26 +341,13 @@ xfs_rename( * trans_commit will unlock src_ip, target_ip & decrement * the vnode references. */ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit */ -std_return: - if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || - DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { - (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, - src_dp, DM_RIGHT_NULL, - target_dp, DM_RIGHT_NULL, - src_name->name, target_name->name, - 0, error, 0); - } - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); abort_return: cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ error_return: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, cancel_flags); - goto std_return; + std_return: + return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a2d32ce..891260f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -25,17 +25,10 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -129,7 +122,7 @@ xfs_growfs_rt_alloc( cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist, NULL); + resblks, &map, &nmap, &flist); if (!error && nmap < 1) error = XFS_ERROR(ENOSPC); if (error) diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index e336742..56861d5 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -24,27 +24,12 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_attr.h" -#include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_buf_item.h" #include "xfs_rw.h" -#include "xfs_trace.h" /* * Force a shutdown of the filesystem instantly while keeping diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 28547df..fdca741 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -24,16 +25,12 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -47,6 +44,7 @@ #include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; +kmem_zone_t *xfs_log_item_desc_zone; /* @@ -597,8 +595,7 @@ _xfs_trans_alloc( tp->t_magic = XFS_TRANS_MAGIC; tp->t_type = type; tp->t_mountp = mp; - tp->t_items_free = XFS_LIC_NUM_SLOTS; - xfs_lic_init(&(tp->t_items)); + INIT_LIST_HEAD(&tp->t_items); INIT_LIST_HEAD(&tp->t_busy); return tp; } @@ -643,8 +640,7 @@ xfs_trans_dup( ntp->t_magic = XFS_TRANS_MAGIC; ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; - ntp->t_items_free = XFS_LIC_NUM_SLOTS; - xfs_lic_init(&(ntp->t_items)); + INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1124,6 +1120,108 @@ xfs_trans_unreserve_and_mod_sb( } /* + * Add the given log item to the transaction's list of log items. + * + * The log item will now point to its new descriptor with its li_desc field. + */ +void +xfs_trans_add_item( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_log_item_desc *lidp; + + ASSERT(lip->li_mountp = tp->t_mountp); + ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + + lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); + + lidp->lid_item = lip; + lidp->lid_flags = 0; + lidp->lid_size = 0; + list_add_tail(&lidp->lid_trans, &tp->t_items); + + lip->li_desc = lidp; +} + +STATIC void +xfs_trans_free_item_desc( + struct xfs_log_item_desc *lidp) +{ + list_del_init(&lidp->lid_trans); + kmem_zone_free(xfs_log_item_desc_zone, lidp); +} + +/* + * Unlink and free the given descriptor. + */ +void +xfs_trans_del_item( + struct xfs_log_item *lip) +{ + xfs_trans_free_item_desc(lip->li_desc); + lip->li_desc = NULL; +} + +/* + * Unlock all of the items of a transaction and free all the descriptors + * of that transaction. + */ +STATIC void +xfs_trans_free_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn, + int flags) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + if (flags & XFS_TRANS_ABORT) + lip->li_flags |= XFS_LI_ABORTED; + IOP_UNLOCK(lip); + + xfs_trans_free_item_desc(lidp); + } +} + +/* + * Unlock the items associated with a transaction. + * + * Items which were not logged should be freed. Those which were logged must + * still be tracked so they can be unpinned when the transaction commits. + */ +STATIC void +xfs_trans_unlock_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + IOP_COMMITTING(lip, commit_lsn); + IOP_UNLOCK(lip); + + /* + * Free the descriptor if the item is not dirty + * within this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + xfs_trans_free_item_desc(lidp); + } +} + +/* * Total up the number of log iovecs needed to commit this * transaction. The transaction itself needs one for the * transaction header. Ask each dirty item in turn how many @@ -1134,30 +1232,27 @@ xfs_trans_count_vecs( struct xfs_trans *tp) { int nvecs; - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; nvecs = 1; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); /* In the non-debug case we need to start bailing out if we * didn't find a log_item here, return zero and let trans_commit * deal with it. */ - if (lidp == NULL) + if (list_empty(&tp->t_items)) { + ASSERT(0); return 0; + } - while (lidp != NULL) { + list_for_each_entry(lidp, &tp->t_items, lid_trans) { /* * Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); + if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - } lidp->lid_size = IOP_SIZE(lidp->lid_item); nvecs += lidp->lid_size; - lidp = xfs_trans_next_item(tp, lidp); } return nvecs; @@ -1177,7 +1272,7 @@ xfs_trans_fill_vecs( struct xfs_trans *tp, struct xfs_log_iovec *log_vector) { - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; struct xfs_log_iovec *vecp; uint nitems; @@ -1188,14 +1283,11 @@ xfs_trans_fill_vecs( vecp = log_vector + 1; nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp); - while (lidp) { + ASSERT(!list_empty(&tp->t_items)); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); + if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - } /* * The item may be marked dirty but not log anything. This can @@ -1206,7 +1298,6 @@ xfs_trans_fill_vecs( IOP_FORMAT(lidp->lid_item, vecp); vecp += lidp->lid_size; IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); } /* @@ -1284,7 +1375,7 @@ xfs_trans_item_committed( * log item flags, if anyone else stales the buffer we do not want to * pay any attention to it. */ - IOP_UNPIN(lip); + IOP_UNPIN(lip, 0); } /* @@ -1301,24 +1392,15 @@ xfs_trans_committed( struct xfs_trans *tp, int abortflag) { - xfs_log_item_desc_t *lidp; - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; + struct xfs_log_item_desc *lidp, *next; /* Call the transaction's completion callback if there is one. */ if (tp->t_callback != NULL) tp->t_callback(tp, tp->t_callarg); - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); - } - - /* free the item chunks, ignoring the embedded chunk */ - for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) { - next_licp = licp->lic_next; - kmem_free(licp); + xfs_trans_free_item_desc(lidp); } xfs_trans_free(tp); @@ -1333,16 +1415,14 @@ xfs_trans_uncommit( struct xfs_trans *tp, uint flags) { - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { + list_for_each_entry(lidp, &tp->t_items, lid_trans) { /* * Unpin all but those that aren't dirty. */ if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); + IOP_UNPIN(lidp->lid_item, 1); } xfs_trans_unreserve_and_mod_sb(tp); @@ -1508,33 +1588,28 @@ STATIC struct xfs_log_vec * xfs_trans_alloc_log_vecs( xfs_trans_t *tp) { - xfs_log_item_desc_t *lidp; + struct xfs_log_item_desc *lidp; struct xfs_log_vec *lv = NULL; struct xfs_log_vec *ret_lv = NULL; - lidp = xfs_trans_first_item(tp); /* Bail out if we didn't find a log item. */ - if (!lidp) { + if (list_empty(&tp->t_items)) { ASSERT(0); return NULL; } - while (lidp != NULL) { + list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_vec *new_lv; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); + if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - } /* Skip items that do not have any vectors for writing */ lidp->lid_size = IOP_SIZE(lidp->lid_item); - if (!lidp->lid_size) { - lidp = xfs_trans_next_item(tp, lidp); + if (!lidp->lid_size) continue; - } new_lv = kmem_zalloc(sizeof(*new_lv) + lidp->lid_size * sizeof(struct xfs_log_iovec), @@ -1549,7 +1624,6 @@ xfs_trans_alloc_log_vecs( else lv->lv_next = new_lv; lv = new_lv; - lidp = xfs_trans_next_item(tp, lidp); } return ret_lv; @@ -1708,12 +1782,6 @@ xfs_trans_cancel( int flags) { int log_flags; -#ifdef DEBUG - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - int i; -#endif xfs_mount_t *mp = tp->t_mountp; /* @@ -1732,21 +1800,11 @@ xfs_trans_cancel( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT)) { - licp = &(tp->t_items); - while (licp != NULL) { - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (!XFS_FORCED_SHUTDOWN(mp)) - ASSERT(!(lip->li_type == XFS_LI_EFD)); - } - licp = licp->lic_next; - } + if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) + ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); } #endif xfs_trans_unreserve_and_mod_sb(tp); @@ -1834,7 +1892,6 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); + xfs_trans_ijoin(trans, dp); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e639e8e..c13c0f9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -161,105 +161,14 @@ typedef struct xfs_trans_header { * the amount of space needed to log the item it describes * once we get to commit processing (see xfs_trans_commit()). */ -typedef struct xfs_log_item_desc { +struct xfs_log_item_desc { struct xfs_log_item *lid_item; - ushort lid_size; - unsigned char lid_flags; - unsigned char lid_index; -} xfs_log_item_desc_t; + ushort lid_size; + unsigned char lid_flags; + struct list_head lid_trans; +}; #define XFS_LID_DIRTY 0x1 -#define XFS_LID_PINNED 0x2 - -/* - * This structure is used to maintain a chunk list of log_item_desc - * structures. The free field is a bitmask indicating which descriptors - * in this chunk's array are free. The unused field is the first value - * not used since this chunk was allocated. - */ -#define XFS_LIC_NUM_SLOTS 15 -typedef struct xfs_log_item_chunk { - struct xfs_log_item_chunk *lic_next; - ushort lic_free; - ushort lic_unused; - xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS]; -} xfs_log_item_chunk_t; - -#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1) -#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1) - - -/* - * Initialize the given chunk. Set the chunk's free descriptor mask - * to indicate that all descriptors are free. The caller gets to set - * lic_unused to the right value (0 matches all free). The - * lic_descs.lid_index values are set up as each desc is allocated. - */ -static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) -{ - cp->lic_free = XFS_LIC_FREEMASK; -} - -static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_descs[slot].lid_index = (unsigned char)(slot); -} - -static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) -{ - return cp->lic_free & XFS_LIC_FREEMASK; -} - -static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) -{ - cp->lic_free = XFS_LIC_FREEMASK; -} - -static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) -{ - return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); -} - -static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) -{ - return (cp->lic_free & (1 << slot)); -} - -static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_free &= ~(1 << slot); -} - -static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_free |= 1 << slot; -} - -static inline xfs_log_item_desc_t * -xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) -{ - return &(cp->lic_descs[slot]); -} - -static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) -{ - return (uint)dp->lid_index; -} - -/* - * Calculate the address of a chunk given a descriptor pointer: - * dp - dp->lid_index give the address of the start of the lic_descs array. - * From this we subtract the offset of the lic_descs field in a chunk. - * All of this yields the address of the chunk, which is - * cast to a chunk pointer. - */ -static inline xfs_log_item_chunk_t * -xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) -{ - return (xfs_log_item_chunk_t*) \ - (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \ - (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); -} #define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ /* @@ -275,8 +184,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) /* * Values for call flags parameter. */ -#define XFS_TRANS_NOSLEEP 0x1 -#define XFS_TRANS_WAIT 0x2 #define XFS_TRANS_RELEASE_LOG_RES 0x4 #define XFS_TRANS_ABORT 0x8 @@ -438,8 +345,7 @@ typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *); - void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); + void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_trylock)(xfs_log_item_t *); void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); @@ -451,8 +357,7 @@ typedef struct xfs_item_ops { #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) -#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) +#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) #define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) @@ -516,8 +421,7 @@ typedef struct xfs_trans { int64_t t_rblocks_delta;/* superblock rblocks change */ int64_t t_rextents_delta;/* superblocks rextents chg */ int64_t t_rextslog_delta;/* superblocks rextslog chg */ - unsigned int t_items_free; /* log item descs free */ - xfs_log_item_chunk_t t_items; /* first log item desc chunk */ + struct list_head t_items; /* log item descriptors */ xfs_trans_header_t t_header; /* header for in-log trans */ struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ @@ -569,8 +473,8 @@ void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, xfs_ino_t , uint, uint, struct xfs_inode **); -void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); -void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); +void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); @@ -595,6 +499,7 @@ int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); extern kmem_zone_t *xfs_trans_zone; +extern kmem_zone_t *xfs_log_item_desc_zone; #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index e799824..dc90695 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -24,7 +24,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 63d81a2..90af025 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -24,14 +24,10 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_buf_item.h" @@ -51,36 +47,17 @@ xfs_trans_buf_item_match( xfs_daddr_t blkno, int len) { - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - int i; + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (xfs_lic_are_all_free(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) - continue; - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) - continue; - - if (XFS_BUF_TARGET(blip->bli_buf) == target && - XFS_BUF_ADDR(blip->bli_buf) == blkno && - XFS_BUF_COUNT(blip->bli_buf) == len) - return blip->bli_buf; - } + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; } return NULL; @@ -127,7 +104,7 @@ _xfs_trans_bjoin( /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); + xfs_trans_add_item(tp, &bip->bli_item); /* * Initialize b_fsprivate2 so we can find it with incore_match() @@ -483,7 +460,6 @@ xfs_trans_brelse(xfs_trans_t *tp, { xfs_buf_log_item_t *bip; xfs_log_item_t *lip; - xfs_log_item_desc_t *lidp; /* * Default to a normal brelse() call if the tp is NULL. @@ -514,13 +490,6 @@ xfs_trans_brelse(xfs_trans_t *tp, ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - /* - * Find the item descriptor pointing to this buffer's - * log item. It must be there. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); - trace_xfs_trans_brelse(bip); /* @@ -536,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp, * If the buffer is dirty within this transaction, we can't * release it until we commit. */ - if (lidp->lid_flags & XFS_LID_DIRTY) + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) return; /* @@ -553,7 +522,7 @@ xfs_trans_brelse(xfs_trans_t *tp, /* * Free up the log item descriptor tracking the released item. */ - xfs_trans_free_item(tp, lidp); + xfs_trans_del_item(&bip->bli_item); /* * Clear the hold flag in the buf log item if it is set. @@ -665,7 +634,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, uint last) { xfs_buf_log_item_t *bip; - xfs_log_item_desc_t *lidp; ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); @@ -690,7 +658,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); - bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_iodone; trace_xfs_trans_log_buf(bip); @@ -707,11 +675,8 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); } @@ -740,7 +705,6 @@ xfs_trans_binval( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_log_item_desc_t *lidp; xfs_buf_log_item_t *bip; ASSERT(XFS_BUF_ISBUSY(bp)); @@ -748,8 +712,6 @@ xfs_trans_binval( ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_binval(bip); @@ -764,7 +726,7 @@ xfs_trans_binval( ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(lidp->lid_flags & XFS_LID_DIRTY); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } @@ -797,7 +759,7 @@ xfs_trans_binval( bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } @@ -853,12 +815,9 @@ xfs_trans_stale_inode_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_iodone; } - - /* * Mark the buffer as being one which contains newly allocated * inodes. We need to make sure that even if this buffer is diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 27cce2a..f783d5e 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -23,7 +23,6 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" @@ -49,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp, /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); - - return (efip); + xfs_trans_add_item(tp, &efip->efi_item); + return efip; } /* @@ -65,15 +63,11 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, xfs_fsblock_t start_block, xfs_extlen_t ext_len) { - xfs_log_item_desc_t *lidp; uint next_extent; xfs_extent_t *extp; - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; next_extent = efip->efi_next_extent; ASSERT(next_extent < efip->efi_format.efi_nextents); @@ -106,9 +100,8 @@ xfs_trans_get_efd(xfs_trans_t *tp, /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); - - return (efdp); + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; } /* @@ -122,15 +115,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp, xfs_fsblock_t start_block, xfs_extlen_t ext_len) { - xfs_log_item_desc_t *lidp; uint next_extent; xfs_extent_t *extp; - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 2559dfe..cdc53a1 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -24,20 +24,16 @@ #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" +#include "xfs_trace.h" #ifdef XFS_TRANS_DEBUG STATIC void @@ -47,7 +43,6 @@ xfs_trans_inode_broot_debug( #define xfs_trans_inode_broot_debug(ip) #endif - /* * Get an inode and join it to the transaction. */ @@ -63,77 +58,65 @@ xfs_trans_iget( int error; error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp); - if (!error && tp) - xfs_trans_ijoin(tp, *ipp, lock_flags); + if (!error && tp) { + xfs_trans_ijoin(tp, *ipp); + (*ipp)->i_itemp->ili_lock_flags = lock_flags; + } return error; } /* - * Add the locked inode to the transaction. - * The inode must be locked, and it cannot be associated with any - * transaction. The caller must specify the locks already held - * on the inode. + * Add a locked inode to the transaction. + * + * The inode must be locked, and it cannot be associated with any transaction. */ void xfs_trans_ijoin( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint lock_flags) + struct xfs_trans *tp, + struct xfs_inode *ip) { xfs_inode_log_item_t *iip; ASSERT(ip->i_transp == NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(lock_flags & XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; - ASSERT(iip->ili_flags == 0); + ASSERT(iip->ili_lock_flags == 0); /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); + xfs_trans_add_item(tp, &iip->ili_item); xfs_trans_inode_broot_debug(ip); /* - * If the IO lock is already held, mark that in the inode log item. - */ - if (lock_flags & XFS_IOLOCK_EXCL) { - iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; - } - - /* * Initialize i_transp so we can find it with xfs_inode_incore() * in xfs_trans_iget() above. */ ip->i_transp = tp; } - - /* - * Mark the inode as not needing to be unlocked when the inode item's - * IOP_UNLOCK() routine is called. The inode must already be locked - * and associated with the given transaction. + * Add a locked inode to the transaction. + * + * + * Grabs a reference to the inode which will be dropped when the transaction + * is commited. The inode will also be unlocked at that point. The inode + * must be locked, and it cannot be associated with any transaction. */ -/*ARGSUSED*/ void -xfs_trans_ihold( - xfs_trans_t *tp, - xfs_inode_t *ip) +xfs_trans_ijoin_ref( + struct xfs_trans *tp, + struct xfs_inode *ip, + uint lock_flags) { - ASSERT(ip->i_transp == tp); - ASSERT(ip->i_itemp != NULL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - ip->i_itemp->ili_flags |= XFS_ILI_HOLD; + xfs_trans_ijoin(tp, ip); + IHOLD(ip); + ip->i_itemp->ili_lock_flags = lock_flags; } - /* * This is called to mark the fields indicated in fieldmask as needing * to be logged when the transaction is committed. The inode must @@ -149,17 +132,12 @@ xfs_trans_log_inode( xfs_inode_t *ip, uint flags) { - xfs_log_item_desc_t *lidp; - ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; /* * Always OR in the bits from the ili_last_fields field. diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c deleted file mode 100644 index f11d37d..0000000 --- a/fs/xfs/xfs_trans_item.c +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -/* XXX: from here down needed until struct xfs_trans has its own ailp */ -#include "xfs_bit.h" -#include "xfs_buf_item.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" - -STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, - int, int, xfs_lsn_t); - -/* - * This is called to add the given log item to the transaction's - * list of log items. It must find a free log item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to item descriptor used to point - * to the new item. The log item will now point to its new descriptor - * with its li_desc field. - */ -xfs_log_item_desc_t * -xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_chunk_t *licp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_items_free == 0) { - licp = (xfs_log_item_chunk_t*) - kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP); - ASSERT(licp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - xfs_lic_init(licp); - xfs_lic_claim(licp, 0); - licp->lic_unused = 1; - xfs_lic_init_slot(licp, 0); - lidp = xfs_lic_slot(licp, 0); - - /* - * Link in the new chunk and update the free count. - */ - licp->lic_next = tp->t_items.lic_next; - tp->t_items.lic_next = licp; - tp->t_items_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lidp->lid_item = lip; - lidp->lid_flags = 0; - lidp->lid_size = 0; - lip->li_desc = lidp; - lip->li_mountp = tp->t_mountp; - lip->li_ailp = tp->t_mountp->m_ail; - return lidp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - licp = &tp->t_items; - while (licp != NULL) { - if (xfs_lic_vacancy(licp)) { - if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { - i = licp->lic_unused; - ASSERT(xfs_lic_isfree(licp, i)); - break; - } - for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { - if (xfs_lic_isfree(licp, i)) - break; - } - ASSERT(i <= XFS_LIC_MAX_SLOT); - break; - } - licp = licp->lic_next; - } - ASSERT(licp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - xfs_lic_claim(licp, i); - if (licp->lic_unused <= i) { - licp->lic_unused = i + 1; - xfs_lic_init_slot(licp, i); - } - lidp = xfs_lic_slot(licp, i); - tp->t_items_free--; - lidp->lid_item = lip; - lidp->lid_flags = 0; - lidp->lid_size = 0; - lip->li_desc = lidp; - lip->li_mountp = tp->t_mountp; - lip->li_ailp = tp->t_mountp->m_ail; - return lidp; -} - -/* - * Free the given descriptor. - * - * This requires setting the bit in the chunk's free mask corresponding - * to the given slot. - */ -void -xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) -{ - uint slot; - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t **licpp; - - slot = xfs_lic_desc_to_slot(lidp); - licp = xfs_lic_desc_to_chunk(lidp); - xfs_lic_relse(licp, slot); - lidp->lid_item->li_desc = NULL; - tp->t_items_free++; - - /* - * If there are no more used items in the chunk and this is not - * the chunk embedded in the transaction structure, then free - * the chunk. First pull it from the chunk list and then - * free it back to the heap. We didn't bother with a doubly - * linked list here because the lists should be very short - * and this is not a performance path. It's better to save - * the memory of the extra pointer. - * - * Also decrement the transaction structure's count of free items - * by the number in a chunk since we are freeing an empty chunk. - */ - if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) { - licpp = &(tp->t_items.lic_next); - while (*licpp != licp) { - ASSERT(*licpp != NULL); - licpp = &((*licpp)->lic_next); - } - *licpp = licp->lic_next; - kmem_free(licp); - tp->t_items_free -= XFS_LIC_NUM_SLOTS; - } -} - -/* - * This is called to find the descriptor corresponding to the given - * log item. It returns a pointer to the descriptor. - * The log item MUST have a corresponding descriptor in the given - * transaction. This routine does not return NULL, it panics. - * - * The descriptor pointer is kept in the log item's li_desc field. - * Just return it. - */ -/*ARGSUSED*/ -xfs_log_item_desc_t * -xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip) -{ - ASSERT(lip->li_desc != NULL); - - return lip->li_desc; -} - - -/* - * Return a pointer to the first descriptor in the chunk list. - * This does not return NULL if there are none, it panics. - * - * The first descriptor must be in either the first or second chunk. - * This is because the only chunk allowed to be empty is the first. - * All others are freed when they become empty. - * - * At some point this and xfs_trans_next_item() should be optimized - * to quickly look at the mask to determine if there is anything to - * look at. - */ -xfs_log_item_desc_t * -xfs_trans_first_item(xfs_trans_t *tp) -{ - xfs_log_item_chunk_t *licp; - int i; - - licp = &tp->t_items; - /* - * If it's not in the first chunk, skip to the second. - */ - if (xfs_lic_are_all_free(licp)) { - licp = licp->lic_next; - } - - /* - * Return the first non-free descriptor in the chunk. - */ - ASSERT(!xfs_lic_are_all_free(licp)); - for (i = 0; i < licp->lic_unused; i++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - return xfs_lic_slot(licp, i); - } - cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); - return NULL; -} - - -/* - * Given a descriptor, return the next descriptor in the chunk list. - * This returns NULL if there are no more used descriptors in the list. - * - * We do this by first locating the chunk in which the descriptor resides, - * and then scanning forward in the chunk and the list for the next - * used descriptor. - */ -/*ARGSUSED*/ -xfs_log_item_desc_t * -xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) -{ - xfs_log_item_chunk_t *licp; - int i; - - licp = xfs_lic_desc_to_chunk(lidp); - - /* - * First search the rest of the chunk. The for loop keeps us - * from referencing things beyond the end of the chunk. - */ - for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - return xfs_lic_slot(licp, i); - } - - /* - * Now search the next chunk. It must be there, because the - * next chunk would have been freed if it were empty. - * If there is no next chunk, return NULL. - */ - if (licp->lic_next == NULL) { - return NULL; - } - - licp = licp->lic_next; - ASSERT(!xfs_lic_are_all_free(licp)); - for (i = 0; i < licp->lic_unused; i++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - - return xfs_lic_slot(licp, i); - } - ASSERT(0); - /* NOTREACHED */ - return NULL; /* keep gcc quite */ -} - -/* - * This is called to unlock all of the items of a transaction and to free - * all the descriptors of that transaction. - * - * It walks the list of descriptors and unlocks each item. It frees - * each chunk except that embedded in the transaction as it goes along. - */ -void -xfs_trans_free_items( - xfs_trans_t *tp, - xfs_lsn_t commit_lsn, - int flags) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - int abort; - - abort = flags & XFS_TRANS_ABORT; - licp = &tp->t_items; - /* - * Special case the embedded chunk so we don't free it below. - */ - if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); - xfs_lic_all_free(licp); - licp->lic_unused = 0; - } - licp = licp->lic_next; - - /* - * Unlock each item in each chunk and free the chunks. - */ - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); - next_licp = licp->lic_next; - kmem_free(licp); - licp = next_licp; - } - - /* - * Reset the transaction structure's free item count. - */ - tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_items.lic_next = NULL; -} - - - -/* - * This is called to unlock the items associated with a transaction. - * Items which were not logged should be freed. - * Those which were logged must still be tracked so they can be unpinned - * when the transaction commits. - */ -void -xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_item_chunk_t **licpp; - int freed; - - freed = 0; - licp = &tp->t_items; - - /* - * Special case the embedded chunk so we don't free. - */ - if (!xfs_lic_are_all_free(licp)) { - freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); - } - licpp = &(tp->t_items.lic_next); - licp = licp->lic_next; - - /* - * Unlock each item in each chunk, free non-dirty descriptors, - * and free empty chunks. - */ - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); - next_licp = licp->lic_next; - if (xfs_lic_are_all_free(licp)) { - *licpp = next_licp; - kmem_free(licp); - freed -= XFS_LIC_NUM_SLOTS; - } else { - licpp = &(licp->lic_next); - } - ASSERT(*licpp == next_licp); - licp = next_licp; - } - - /* - * Fix the free descriptor count in the transaction. - */ - tp->t_items_free += freed; -} - -/* - * Unlock each item pointed to by a descriptor in the given chunk. - * Stamp the commit lsn into each item if necessary. - * Free descriptors pointing to items which are not dirty if freeing_chunk - * is zero. If freeing_chunk is non-zero, then we need to unlock all - * items in the chunk. - * - * Return the number of descriptors freed. - */ -STATIC int -xfs_trans_unlock_chunk( - xfs_log_item_chunk_t *licp, - int freeing_chunk, - int abort, - xfs_lsn_t commit_lsn) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - int i; - int freed; - - freed = 0; - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (xfs_lic_isfree(licp, i)) { - continue; - } - lip = lidp->lid_item; - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - if (abort) - lip->li_flags |= XFS_LI_ABORTED; - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction and the caller is not - * going to just free the entire thing regardless. - */ - if (!(freeing_chunk) && - (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { - xfs_lic_relse(licp, i); - freed++; - } - } - - return freed; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index c6e4f2c8..e2d93d8 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -23,22 +23,8 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; -/* - * From xfs_trans_item.c - */ -struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *, - struct xfs_log_item *); -void xfs_trans_free_item(struct xfs_trans *, - struct xfs_log_item_desc *); -struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, - struct xfs_log_item *); -struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); -struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, - struct xfs_log_item_desc *); - -void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); -void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_item_committed(struct xfs_log_item *lip, xfs_lsn_t commit_lsn, int aborted); diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 4d88616..b7d5769 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -25,18 +25,14 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_rw.h" #include "xfs_itable.h" #include "xfs_utils.h" @@ -324,86 +320,3 @@ xfs_bumplink( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; } - -/* - * Try to truncate the given file to 0 length. Currently called - * only out of xfs_remove when it has to truncate a file to free - * up space for the remove to proceed. - */ -int -xfs_truncate_file( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error; - -#ifdef QUOTADEBUG - /* - * This is called to truncate the quotainodes too. - */ - if (XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_ino != mp->m_sb.sb_uquotino) - ASSERT(ip->i_udquot); - } - if (XFS_IS_OQUOTA_ON(mp)) { - if (ip->i_ino != mp->m_sb.sb_gquotino) - ASSERT(ip->i_gdquot); - } -#endif - /* - * Make the call to xfs_itruncate_start before starting the - * transaction, because we cannot make the call while we're - * in a transaction. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - /* - * Follow the normal truncate locking protocol. Since we - * hold the inode in the transaction, we know that its number - * of references will stay constant. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); - /* - * Signal a sync xaction. The only case where that isn't - * the case is if we're truncating an already unlinked file - * on a wsync fs. In that case, we know the blocks can't - * reappear in the file because the links to file are - * permanently toast. Currently, we're always going to - * want a sync transaction because this code is being - * called from places where nlink is guaranteed to be 1 - * but I'm leaving the tests in to protect against future - * changes -- rcc. - */ - error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0, - XFS_DATA_FORK, - ((ip->i_d.di_nlink != 0 || - !(mp->m_flags & XFS_MOUNT_WSYNC)) - ? 1 : 0)); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); - } else { - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - } - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - return error; -} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index ef32122..f55b967 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -18,7 +18,6 @@ #ifndef __XFS_UTILS_H__ #define __XFS_UTILS_H__ -extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, xfs_inode_t **, int *); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c164683..3ac137d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -26,19 +26,14 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_itable.h" -#include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_bmap.h" @@ -73,7 +68,7 @@ xfs_setattr( struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int need_iolock = 1; - xfs_itrace_entry(ip); + trace_xfs_setattr(ip); if (mp->m_flags & XFS_MOUNT_RDONLY) return XFS_ERROR(EROFS); @@ -143,16 +138,6 @@ xfs_setattr( goto error_return; } } else { - if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && - !(flags & XFS_ATTR_DMI)) { - int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; - code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, - iattr->ia_size, 0, dmflags, NULL); - if (code) { - lock_flags = 0; - goto error_return; - } - } if (need_iolock) lock_flags |= XFS_IOLOCK_EXCL; } @@ -283,8 +268,7 @@ xfs_setattr( commit_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Only change the c/mtime if we are changing the size @@ -334,8 +318,7 @@ xfs_setattr( xfs_iflags_set(ip, XFS_ITRUNCATED); } } else if (tp) { - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); } /* @@ -470,17 +453,10 @@ xfs_setattr( return XFS_ERROR(code); } - if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && - !(flags & XFS_ATTR_DMI)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, NULL, NULL, - 0, 0, AT_DELAY_FLAG(flags)); - } return 0; abort_return: commit_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ error_return: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -516,7 +492,7 @@ xfs_readlink_bmap( int error = 0; error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0, - mval, &nmaps, NULL, NULL); + mval, &nmaps, NULL); if (error) goto out; @@ -557,7 +533,7 @@ xfs_readlink( int pathlen; int error = 0; - xfs_itrace_entry(ip); + trace_xfs_readlink(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -613,14 +589,14 @@ xfs_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - map_len = last_fsb - end_fsb; - if (map_len <= 0) + if (last_fsb <= end_fsb) return 0; + map_len = last_fsb - end_fsb; nimaps = 1; xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL, NULL); + NULL, 0, &imap, &nimaps, NULL); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && @@ -675,10 +651,7 @@ xfs_free_eofblocks( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, - XFS_IOLOCK_EXCL | - XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); error = xfs_itruncate_finish(&tp, ip, ip->i_size, @@ -750,8 +723,7 @@ xfs_inactive_symlink_rmt( xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -761,7 +733,7 @@ xfs_inactive_symlink_rmt( nmaps = ARRAY_SIZE(mval); if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list, NULL))) + &free_list))) goto error0; /* * Invalidate the block(s). @@ -776,7 +748,7 @@ xfs_inactive_symlink_rmt( * Unmap the dead block(s) to the free_list. */ if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, NULL, &done))) + &first_block, &free_list, &done))) goto error1; ASSERT(done); /* @@ -795,8 +767,7 @@ xfs_inactive_symlink_rmt( * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Get a new, empty transaction to return to our caller. @@ -929,8 +900,7 @@ xfs_inactive_attrs( goto error_cancel; xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); xfs_idestroy_fork(ip, XFS_ATTR_FORK); ASSERT(ip->i_d.di_anextents == 0); @@ -1035,8 +1005,6 @@ xfs_inactive( int error; int truncate; - xfs_itrace_entry(ip); - /* * If the inode is already free, then there can be nothing * to clean up here. @@ -1060,9 +1028,6 @@ xfs_inactive( mp = ip->i_mount; - if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) - XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL); - error = 0; /* If this is a read-only mount, don't do this (would generate I/O) */ @@ -1120,8 +1085,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * normally, we have to run xfs_itruncate_finish sync. @@ -1154,8 +1118,7 @@ xfs_inactive( return VN_INACTIVE_CACHE; } - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), @@ -1168,8 +1131,7 @@ xfs_inactive( } xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); } /* @@ -1257,7 +1219,7 @@ xfs_lookup( int error; uint lock_mode; - xfs_itrace_entry(dp); + trace_xfs_lookup(dp, name); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); @@ -1309,21 +1271,11 @@ xfs_create( uint log_res; uint log_count; - xfs_itrace_entry(dp); + trace_xfs_create(dp, name); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, name->name, NULL, - mode, 0, 0); - - if (error) - return error; - } - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; else @@ -1427,8 +1379,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1487,16 +1438,7 @@ xfs_create( xfs_qm_dqrele(gdqp); *ipp = ip; - - /* Fallthrough to std_return with error = 0 */ - std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { - XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL, - ip, DM_RIGHT_NULL, name->name, NULL, mode, - error, 0); - } - - return error; + return 0; out_bmap_cancel: xfs_bmap_cancel(&free_list); @@ -1510,8 +1452,8 @@ xfs_create( if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - - goto std_return; + std_return: + return error; out_abort_rele: /* @@ -1726,20 +1668,11 @@ xfs_remove( uint resblks; uint log_count; - xfs_itrace_entry(dp); - xfs_itrace_entry(ip); + trace_xfs_remove(dp, name); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, name->name, NULL, - ip->i_d.di_mode, 0, 0); - if (error) - return error; - } - error = xfs_qm_dqattach(dp, 0); if (error) goto std_return; @@ -1782,15 +1715,8 @@ xfs_remove( xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - /* - * At this point, we've gotten both the directory and the entry - * inodes locked. - */ - IHOLD(ip); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - - IHOLD(dp); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. @@ -1877,21 +1803,15 @@ xfs_remove( if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); - std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { - XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, name->name, NULL, - ip->i_d.di_mode, error, 0); - } - - return error; + return 0; out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - goto std_return; + std_return: + return error; } int @@ -1909,25 +1829,13 @@ xfs_link( int committed; int resblks; - xfs_itrace_entry(tdp); - xfs_itrace_entry(sip); + trace_xfs_link(tdp, target_name); ASSERT(!S_ISDIR(sip->i_d.di_mode)); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, - tdp, DM_RIGHT_NULL, - sip, DM_RIGHT_NULL, - target_name->name, NULL, 0, 0, 0); - if (error) - return error; - } - - /* Return through std_return after this point. */ - error = xfs_qm_dqattach(sip, 0); if (error) goto std_return; @@ -1953,15 +1861,8 @@ xfs_link( xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); - /* - * Increment vnode ref counts since xfs_trans_commit & - * xfs_trans_cancel will both unlock the inodes and - * decrement the associated ref counts. - */ - IHOLD(sip); - IHOLD(tdp); - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL); /* * If the source has too many links, we can't make any more to it. @@ -2014,27 +1915,14 @@ xfs_link( goto abort_return; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) - goto std_return; - - /* Fall through to std_return with error = 0. */ -std_return: - if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, - tdp, DM_RIGHT_NULL, - sip, DM_RIGHT_NULL, - target_name->name, NULL, 0, error, 0); - } - return error; + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); abort_return: cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - error_return: xfs_trans_cancel(tp, cancel_flags); - goto std_return; + std_return: + return error; } int @@ -2074,7 +1962,7 @@ xfs_symlink( ip = NULL; tp = NULL; - xfs_itrace_entry(dp); + trace_xfs_symlink(dp, link_name); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -2086,17 +1974,6 @@ xfs_symlink( if (pathlen >= MAXPATHLEN) /* total string too long */ return XFS_ERROR(ENAMETOOLONG); - if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp, - DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name->name, - (unsigned char *)target_path, 0, 0, 0); - if (error) - return error; - } - - /* Return through std_return after this point. */ - udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; @@ -2180,8 +2057,7 @@ xfs_symlink( * transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; /* @@ -2215,7 +2091,7 @@ xfs_symlink( error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, - &free_list, NULL); + &free_list); if (error) { goto error1; } @@ -2278,21 +2154,8 @@ xfs_symlink( xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit */ -std_return: - if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, - dp, DM_RIGHT_NULL, - error ? NULL : ip, - DM_RIGHT_NULL, link_name->name, - (unsigned char *)target_path, - 0, error, 0); - } - - if (!error) - *ipp = ip; - return error; + *ipp = ip; + return 0; error2: IRELE(ip); @@ -2306,8 +2169,8 @@ std_return: if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - - goto std_return; + std_return: + return error; } int @@ -2333,13 +2196,12 @@ xfs_set_dmattrs( return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); ip->i_d.di_dmevmask = evmask; ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - IHOLD(ip); error = xfs_trans_commit(tp, 0); return error; @@ -2390,7 +2252,7 @@ xfs_alloc_file_space( int committed; int error; - xfs_itrace_entry(ip); + trace_xfs_alloc_file_space(ip); if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -2412,25 +2274,9 @@ xfs_alloc_file_space( startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); - /* Generate a DMAPI event if needed. */ - if (alloc_type != 0 && offset < ip->i_size && - (attr_flags & XFS_ATTR_DMI) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { - xfs_off_t end_dmi_offset; - - end_dmi_offset = offset+len; - if (end_dmi_offset > ip->i_size) - end_dmi_offset = ip->i_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset, - end_dmi_offset - offset, 0, NULL); - if (error) - return error; - } - /* * Allocate file space until done or until there is an error */ -retry: while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; @@ -2488,8 +2334,7 @@ retry: if (error) goto error1; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * Issue the xfs_bmapi() call to allocate the blocks @@ -2498,7 +2343,7 @@ retry: error = xfs_bmapi(tp, ip, startoffset_fsb, allocatesize_fsb, bmapi_flag, &firstfsb, 0, imapp, &nimaps, - &free_list, NULL); + &free_list); if (error) { goto error0; } @@ -2527,17 +2372,6 @@ retry: startoffset_fsb += allocated_fsb; allocatesize_fsb -= allocated_fsb; } -dmapi_enospc_check: - if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, - ip, DM_RIGHT_NULL, - ip, DM_RIGHT_NULL, - NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ - if (error == 0) - goto retry; /* Maybe DMAPI app. has made space */ - /* else fall through with error from XFS_SEND_DATA */ - } return error; @@ -2548,7 +2382,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto dmapi_enospc_check; + return error; } /* @@ -2598,7 +2432,7 @@ xfs_zero_remaining_bytes( offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL, NULL); + NULL, 0, &imap, &nimap, NULL); if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -2661,7 +2495,6 @@ xfs_free_file_space( { int committed; int done; - xfs_off_t end_dmi_offset; xfs_fileoff_t endoffset_fsb; int error; xfs_fsblock_t firstfsb; @@ -2680,7 +2513,7 @@ xfs_free_file_space( mp = ip->i_mount; - xfs_itrace_entry(ip); + trace_xfs_free_file_space(ip); error = xfs_qm_dqattach(ip, 0); if (error) @@ -2691,19 +2524,7 @@ xfs_free_file_space( return error; rt = XFS_IS_REALTIME_INODE(ip); startoffset_fsb = XFS_B_TO_FSB(mp, offset); - end_dmi_offset = offset + len; - endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); - - if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 && - DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { - if (end_dmi_offset > ip->i_size) - end_dmi_offset = ip->i_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, - offset, end_dmi_offset - offset, - AT_DELAY_FLAG(attr_flags), NULL); - if (error) - return error; - } + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); if (attr_flags & XFS_ATTR_NOLOCK) need_iolock = 0; @@ -2731,7 +2552,7 @@ xfs_free_file_space( if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { nimap = 1; error = xfs_bmapi(NULL, ip, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); + 1, 0, NULL, 0, &imap, &nimap, NULL); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2746,7 +2567,7 @@ xfs_free_file_space( } nimap = 1; error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); + 1, 0, NULL, 0, &imap, &nimap, NULL); if (error) goto out_unlock_iolock; ASSERT(nimap == 0 || nimap == 1); @@ -2814,8 +2635,7 @@ xfs_free_file_space( if (error) goto error1; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); /* * issue the bunmapi() call to free the blocks @@ -2823,7 +2643,7 @@ xfs_free_file_space( xfs_bmap_init(&free_list, &firstfsb); error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, NULL, &done); + 0, 2, &firstfsb, &free_list, &done); if (error) { goto error0; } @@ -2883,8 +2703,6 @@ xfs_change_file_space( xfs_trans_t *tp; struct iattr iattr; - xfs_itrace_entry(ip); - if (!S_ISREG(ip->i_d.di_mode)) return XFS_ERROR(EINVAL); @@ -2985,8 +2803,7 @@ xfs_change_file_space( xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip); if ((attr_flags & XFS_ATTR_DMI) == 0) { ip->i_d.di_mode &= ~S_ISUID; |