diff options
Diffstat (limited to 'fs')
451 files changed, 15858 insertions, 11329 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 055562c..9ff073f 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) * @offset: offset in the page */ -static void v9fs_invalidate_page(struct page *page, unsigned long offset) +static void v9fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { /* * If called with zero offset, we should release * the private state assocated with the page */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) v9fs_fscache_invalidate_page(page); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index be1e34a..4d0c2e0 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) } /** - * v9fs_dir_readdir - read a directory - * @filp: opened file structure - * @dirent: directory structure ??? - * @filldir: function to populate directory structure ??? + * v9fs_dir_readdir - iterate through a directory + * @file: opened file structure + * @ctx: actor we feed the entries to * */ -static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) { - int over; + bool over; struct p9_wstat st; int err = 0; struct p9_fid *fid; @@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) int reclen = 0; struct p9_rdir *rdir; - p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - fid = filp->private_data; + p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); + fid = file->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; - rdir = v9fs_alloc_rdir_buf(filp, buflen); + rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; while (1) { if (rdir->tail == rdir->head) { - err = v9fs_file_readn(filp, rdir->buf, NULL, - buflen, filp->f_pos); + err = v9fs_file_readn(file, rdir->buf, NULL, + buflen, ctx->pos); if (err <= 0) return err; @@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) } reclen = st.size+2; - over = filldir(dirent, st.name, strlen(st.name), - filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); - + over = !dir_emit(ctx, st.name, strlen(st.name), + v9fs_qid2ino(&st.qid), dt_type(&st)); p9stat_free(&st); - if (over) return 0; rdir->head += reclen; - filp->f_pos += reclen; + ctx->pos += reclen; } } } /** - * v9fs_dir_readdir_dotl - read a directory - * @filp: opened file structure - * @dirent: buffer to fill dirent structures - * @filldir: function to populate dirent structures + * v9fs_dir_readdir_dotl - iterate through a directory + * @file: opened file structure + * @ctx: actor we feed the entries to * */ -static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, - filldir_t filldir) +static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) { - int over; int err = 0; struct p9_fid *fid; int buflen; struct p9_rdir *rdir; struct p9_dirent curdirent; - u64 oldoffset = 0; - p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); - fid = filp->private_data; + p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); + fid = file->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; - rdir = v9fs_alloc_rdir_buf(filp, buflen); + rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, - filp->f_pos); + ctx->pos); if (err <= 0) return err; @@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, return -EIO; } - /* d_off in dirent structure tracks the offset into - * the next dirent in the dir. However, filldir() - * expects offset into the current dirent. Hence - * while calling filldir send the offset from the - * previous dirent structure. - */ - over = filldir(dirent, curdirent.d_name, - strlen(curdirent.d_name), - oldoffset, v9fs_qid2ino(&curdirent.qid), - curdirent.d_type); - oldoffset = curdirent.d_off; - - if (over) + if (!dir_emit(ctx, curdirent.d_name, + strlen(curdirent.d_name), + v9fs_qid2ino(&curdirent.qid), + curdirent.d_type)) return 0; - filp->f_pos = curdirent.d_off; + ctx->pos = curdirent.d_off; rdir->head += err; } } @@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir, + .iterate = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, }; @@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = { const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = v9fs_dir_readdir_dotl, + .iterate = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, .fsync = v9fs_file_fsync_dotl, diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 9cf874c..0d138c0 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -17,47 +17,43 @@ static DEFINE_RWLOCK(adfs_dir_lock); static int -adfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +adfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct object_info obj; struct adfs_dir dir; int ret = 0; - if (filp->f_pos >> 32) - goto out; + if (ctx->pos >> 32) + return 0; ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); if (ret) - goto out; + return ret; - switch ((unsigned long)filp->f_pos) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) goto free_out; - filp->f_pos += 1; - - case 1: - if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0) + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) goto free_out; - filp->f_pos += 1; - - default: - break; + ctx->pos = 2; } read_lock(&adfs_dir_lock); - ret = ops->setpos(&dir, filp->f_pos - 2); + ret = ops->setpos(&dir, ctx->pos - 2); if (ret) goto unlock_out; while (ops->getnext(&dir, &obj) == 0) { - if (filldir(dirent, obj.name, obj.name_len, - filp->f_pos, obj.file_id, DT_UNKNOWN) < 0) - goto unlock_out; - filp->f_pos += 1; + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.file_id, DT_UNKNOWN)) + break; + ctx->pos++; } unlock_out: @@ -65,8 +61,6 @@ unlock_out: free_out: ops->free(&dir); - -out: return ret; } @@ -192,13 +186,12 @@ out: const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = adfs_readdir, + .iterate = adfs_readdir, .fsync = generic_file_fsync, }; static int -adfs_hash(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr) +adfs_hash(const struct dentry *parent, struct qstr *qstr) { const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen; const unsigned char *name; @@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode, * requirements of the underlying filesystem. */ static int -adfs_compare(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +adfs_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index fd11a6d..f1eba8c 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -15,12 +15,12 @@ #include "affs.h" -static int affs_readdir(struct file *, void *, filldir_t); +static int affs_readdir(struct file *, struct dir_context *); const struct file_operations affs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .readdir = affs_readdir, + .iterate = affs_readdir, .fsync = affs_file_fsync, }; @@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = { }; static int -affs_readdir(struct file *filp, void *dirent, filldir_t filldir) +affs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - struct buffer_head *dir_bh; - struct buffer_head *fh_bh; + struct buffer_head *dir_bh = NULL; + struct buffer_head *fh_bh = NULL; unsigned char *name; int namelen; u32 i; int hash_pos; int chain_pos; - u32 f_pos; u32 ino; - int stored; - int res; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos); + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); - stored = 0; - res = -EIO; - dir_bh = NULL; - fh_bh = NULL; - f_pos = filp->f_pos; - - if (f_pos == 0) { - filp->private_data = (void *)0; - if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0) + if (ctx->pos < 2) { + file->private_data = (void *)0; + if (!dir_emit_dots(file, ctx)) return 0; - filp->f_pos = f_pos = 1; - stored++; - } - if (f_pos == 1) { - if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0) - return stored; - filp->f_pos = f_pos = 2; - stored++; } affs_lock_dir(inode); - chain_pos = (f_pos - 2) & 0xffff; - hash_pos = (f_pos - 2) >> 16; + chain_pos = (ctx->pos - 2) & 0xffff; + hash_pos = (ctx->pos - 2) >> 16; if (chain_pos == 0xffff) { affs_warning(sb, "readdir", "More than 65535 entries in chain"); chain_pos = 0; hash_pos++; - filp->f_pos = ((hash_pos << 16) | chain_pos) + 2; + ctx->pos = ((hash_pos << 16) | chain_pos) + 2; } dir_bh = affs_bread(sb, inode->i_ino); if (!dir_bh) @@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. */ - ino = (u32)(long)filp->private_data; - if (ino && filp->f_version == inode->i_version) { + ino = (u32)(long)file->private_data; + if (ino && file->f_version == inode->i_version) { pr_debug("AFFS: readdir() left off=%d\n", ino); goto inside; } @@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", i); - goto readdir_out; + return -EIO; } ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); @@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir) ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]); if (!ino) continue; - f_pos = (hash_pos << 16) + 2; + ctx->pos = (hash_pos << 16) + 2; inside: do { fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", ino); - goto readdir_done; + break; } namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", - namelen, name, ino, hash_pos, f_pos); - if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0) + namelen, name, ino, hash_pos, (u32)ctx->pos); + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) goto readdir_done; - stored++; - f_pos++; + ctx->pos++; ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); fh_bh = NULL; } while (ino); } readdir_done: - filp->f_pos = f_pos; - filp->f_version = inode->i_version; - filp->private_data = (void *)(long)ino; - res = stored; + file->f_version = inode->i_version; + file->private_data = (void *)(long)ino; readdir_out: affs_brelse(dir_bh); affs_brelse(fh_bh); affs_unlock_dir(inode); - pr_debug("AFFS: readdir()=%d\n", stored); - return res; + return 0; } diff --git a/fs/affs/namei.c b/fs/affs/namei.c index ff65884..c36cbb4 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -13,18 +13,12 @@ typedef int (*toupper_t)(int); static int affs_toupper(int ch); -static int affs_hash_dentry(const struct dentry *, - const struct inode *, struct qstr *); -static int affs_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int affs_hash_dentry(const struct dentry *, struct qstr *); +static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int affs_intl_toupper(int ch); -static int affs_intl_hash_dentry(const struct dentry *, - const struct inode *, struct qstr *); -static int affs_intl_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int affs_intl_hash_dentry(const struct dentry *, struct qstr *); +static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); const struct dentry_operations affs_dentry_operations = { @@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) } static int -affs_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { return __affs_hash_dentry(qstr, affs_toupper); } static int -affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { return __affs_hash_dentry(qstr, affs_intl_toupper); } @@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len, } static int -affs_compare_dentry(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return __affs_compare_dentry(len, str, name, affs_toupper); } static int -affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return __affs_compare_dentry(len, str, name, affs_intl_toupper); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 7a465ed..34494fb 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -22,7 +22,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); -static int afs_readdir(struct file *file, void *dirent, filldir_t filldir); +static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); @@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, .release = afs_release, - .readdir = afs_readdir, + .iterate = afs_readdir, .lock = afs_lock, .llseek = generic_file_llseek, }; @@ -119,9 +119,9 @@ struct afs_dir_page { }; struct afs_lookup_cookie { + struct dir_context ctx; struct afs_fid fid; - const char *name; - size_t nlen; + struct qstr name; int found; }; @@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file) /* * deal with one block in an AFS directory */ -static int afs_dir_iterate_block(unsigned *fpos, +static int afs_dir_iterate_block(struct dir_context *ctx, union afs_dir_block *block, - unsigned blkoff, - void *cookie, - filldir_t filldir) + unsigned blkoff) { union afs_dirent *dire; unsigned offset, next, curr; size_t nlen; - int tmp, ret; + int tmp; - _enter("%u,%x,%p,,",*fpos,blkoff,block); + _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); - curr = (*fpos - blkoff) / sizeof(union afs_dirent); + curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); /* walk through the block, an entry at a time */ for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; @@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos, _debug("ENT[%Zu.%u]: unused", blkoff / sizeof(union afs_dir_block), offset); if (offset >= curr) - *fpos = blkoff + + ctx->pos = blkoff + next * sizeof(union afs_dirent); continue; } @@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos, continue; /* found the next entry */ - ret = filldir(cookie, - dire->u.name, - nlen, - blkoff + offset * sizeof(union afs_dirent), + if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), - filldir == afs_lookup_filldir ? - ntohl(dire->u.unique) : DT_UNKNOWN); - if (ret < 0) { + ctx->actor == afs_lookup_filldir ? + ntohl(dire->u.unique) : DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } - *fpos = blkoff + next * sizeof(union afs_dirent); + ctx->pos = blkoff + next * sizeof(union afs_dirent); } _leave(" = 1 [more]"); @@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos, /* * iterate through the data blob that lists the contents of an AFS directory */ -static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, - filldir_t filldir, struct key *key) +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct key *key) { union afs_dir_block *dblock; struct afs_dir_page *dbuf; @@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, unsigned blkoff, limit; int ret; - _enter("{%lu},%u,,", dir->i_ino, *fpos); + _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { _leave(" = -ESTALE"); @@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, } /* round the file position up to the next entry boundary */ - *fpos += sizeof(union afs_dirent) - 1; - *fpos &= ~(sizeof(union afs_dirent) - 1); + ctx->pos += sizeof(union afs_dirent) - 1; + ctx->pos &= ~(sizeof(union afs_dirent) - 1); /* walk through the blocks in sequence */ ret = 0; - while (*fpos < dir->i_size) { - blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1); + while (ctx->pos < dir->i_size) { + blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); /* fetch the appropriate page from the directory */ page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); @@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / sizeof(union afs_dir_block)]; - ret = afs_dir_iterate_block(fpos, dblock, blkoff, - cookie, filldir); + ret = afs_dir_iterate_block(ctx, dblock, blkoff); if (ret != 1) { afs_dir_put_page(page); goto out; @@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie, blkoff += sizeof(union afs_dir_block); - } while (*fpos < dir->i_size && blkoff < limit); + } while (ctx->pos < dir->i_size && blkoff < limit); afs_dir_put_page(page); ret = 0; @@ -387,23 +380,10 @@ out: /* * read an AFS directory */ -static int afs_readdir(struct file *file, void *cookie, filldir_t filldir) +static int afs_readdir(struct file *file, struct dir_context *ctx) { - unsigned fpos; - int ret; - - _enter("{%Ld,{%lu}}", - file->f_pos, file_inode(file)->i_ino); - - ASSERT(file->private_data != NULL); - - fpos = file->f_pos; - ret = afs_dir_iterate(file_inode(file), &fpos, - cookie, filldir, file->private_data); - file->f_pos = fpos; - - _leave(" = %d", ret); - return ret; + return afs_dir_iterate(file_inode(file), + ctx, file->private_data); } /* @@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, { struct afs_lookup_cookie *cookie = _cookie; - _enter("{%s,%Zu},%s,%u,,%llu,%u", - cookie->name, cookie->nlen, name, nlen, + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, (unsigned long long) ino, dtype); /* insanity checks first */ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); BUILD_BUG_ON(sizeof(union afs_dirent) != 32); - if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) { + if (cookie->name.len != nlen || + memcmp(cookie->name.name, name, nlen) != 0) { _leave(" = 0 [no]"); return 0; } @@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, static int afs_do_lookup(struct inode *dir, struct dentry *dentry, struct afs_fid *fid, struct key *key) { - struct afs_lookup_cookie cookie; - struct afs_super_info *as; - unsigned fpos; + struct afs_super_info *as = dir->i_sb->s_fs_info; + struct afs_lookup_cookie cookie = { + .ctx.actor = afs_lookup_filldir, + .name = dentry->d_name, + .fid.vid = as->volume->vid + }; int ret; _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); - as = dir->i_sb->s_fs_info; - /* search the directory */ - cookie.name = dentry->d_name.name; - cookie.nlen = dentry->d_name.len; - cookie.fid.vid = as->volume->vid; - cookie.found = 0; - - fpos = 0; - ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir, - key); + ret = afs_dir_iterate(dir, &cookie.ctx, key); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; diff --git a/fs/afs/file.c b/fs/afs/file.c index 8f6e923..66d50fe 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,7 +19,8 @@ #include "internal.h" static int afs_readpage(struct file *file, struct page *page); -static void afs_invalidatepage(struct page *page, unsigned long offset); +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); static int afs_launder_page(struct page *page); @@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page) * - release a page and clean up its private data if offset is 0 (indicating * the entire page) */ -static void afs_invalidatepage(struct page *page, unsigned long offset) +static void afs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct afs_writeback *wb = (struct afs_writeback *) page_private(page); - _enter("{%lu},%lu", page->index, offset); + _enter("{%lu},%u,%u", page->index, offset, length); BUG_ON(!PageLocked(page)); /* we clean up only if the entire page is being invalidated */ - if (offset == 0) { + if (offset == 0 && length == PAGE_CACHE_SIZE) { #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 2497bf3..a8cf2cf 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct afs_vnode *vnode = AFS_FS_I(inode); afs_lock_type_t type; struct key *key = file->private_data; int ret; @@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - lock_flocks(); + spin_lock(&inode->i_lock); /* make sure we've got a callback on this file and that our view of the * data version is up to date */ @@ -420,7 +421,7 @@ given_lock: afs_vnode_fetch_status(vnode, NULL, key); error: - unlock_flocks(); + spin_unlock(&inode->i_lock); _leave(" = %d", ret); return ret; @@ -39,6 +39,8 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> +#include "internal.h" + #define AIO_RING_MAGIC 0xa10a10a1 #define AIO_RING_COMPAT_FEATURES 1 #define AIO_RING_INCOMPAT_FEATURES 0 @@ -141,9 +143,6 @@ static void aio_free_ring(struct kioctx *ctx) for (i = 0; i < ctx->nr_pages; i++) put_page(ctx->ring_pages[i]); - if (ctx->mmap_size) - vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); } @@ -307,7 +306,9 @@ static void free_ioctx(struct kioctx *ctx) kunmap_atomic(ring); while (atomic_read(&ctx->reqs_active) > 0) { - wait_event(ctx->wait, head != ctx->tail); + wait_event(ctx->wait, + head != ctx->tail || + atomic_read(&ctx->reqs_active) <= 0); avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; @@ -320,11 +321,6 @@ static void free_ioctx(struct kioctx *ctx) aio_free_ring(ctx); - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); - pr_debug("freeing %p\n", ctx); /* @@ -433,17 +429,24 @@ static void kill_ioctx(struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); - /* Between hlist_del_rcu() and dropping the initial ref */ - synchronize_rcu(); /* - * We can't punt to workqueue here because put_ioctx() -> - * free_ioctx() will unmap the ringbuffer, and that has to be - * done in the original process's context. kill_ioctx_rcu/work() - * exist for exit_aio(), as in that path free_ioctx() won't do - * the unmap. + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). */ - kill_ioctx_work(&ctx->rcu_work); + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + + /* Between hlist_del_rcu() and dropping the initial ref */ + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); } } @@ -493,10 +496,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); - } + kill_ioctx(ctx); } } @@ -625,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* * Add a completion event to the ring buffer. Must be done holding - * ctx->ctx_lock to prevent other code from messing with the tail + * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. */ spin_lock_irqsave(&ctx->completion_lock, flags); @@ -1299,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by - * timeout is relative and will be updated if not NULL and the - * operation blocks. Will fail with -ENOSYS if not implemented. + * timeout is relative. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 13ddec9..3d9d3f5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -109,7 +109,7 @@ cont: spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); /* Already gone or negative dentry (under construction) - try next */ - if (q->d_count == 0 || !simple_positive(q)) { + if (!d_count(q) || !simple_positive(q)) { spin_unlock(&q->d_lock); next = q->d_u.d_child.next; goto cont; @@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, else ino_count++; - if (p->d_count > ino_count) { + if (d_count(p) > ino_count) { top_ino->last_used = jiffies; dput(p); return 1; @@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, if (!exp_leaves) { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (dentry->d_count > ino_count) + if (d_count(dentry) > ino_count) goto next; if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { @@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } else { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (dentry->d_count > ino_count) + if (d_count(dentry) > ino_count) goto next; expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 085da86..92ef341 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = dcache_readdir, + .iterate = dcache_readdir, .llseek = dcache_dir_lseek, .unlocked_ioctl = autofs4_root_ioctl, #ifdef CONFIG_COMPAT @@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = { .open = autofs4_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = dcache_readdir, + .iterate = dcache_readdir, .llseek = dcache_dir_lseek, }; @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (active->d_count == 0) + if (!d_count(active)) goto next; qstr = &active->d_name; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 922ad46..7c93953 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return -EIO; } -static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int bad_file_readdir(struct file *file, struct dir_context *ctx) { return -EIO; } @@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops = .write = bad_file_write, .aio_read = bad_file_aio_read, .aio_write = bad_file_aio_write, - .readdir = bad_file_readdir, + .iterate = bad_file_readdir, .poll = bad_file_poll, .unlocked_ioctl = bad_file_unlocked_ioctl, .compat_ioctl = bad_file_compat_ioctl, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8615ee8..e9c75e2 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -31,7 +31,7 @@ MODULE_LICENSE("GPL"); /* The units the vfs expects inode->i_blocks to be in */ #define VFS_BLOCK_SIZE 512 -static int befs_readdir(struct file *, void *, filldir_t); +static int befs_readdir(struct file *, struct dir_context *); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); @@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, - .readdir = befs_readdir, + .iterate = befs_readdir, .llseek = generic_file_llseek, }; @@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } static int -befs_readdir(struct file *filp, void *dirent, filldir_t filldir) +befs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_off_t value; @@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) size_t keysize; unsigned char d_type; char keybuf[BEFS_NAME_LEN + 1]; - char *nlsname; - int nlsnamelen; - const char *dirname = filp->f_path.dentry->d_name.name; + const char *dirname = file->f_path.dentry->d_name.name; befs_debug(sb, "---> befs_readdir() " - "name %s, inode %ld, filp->f_pos %Ld", - dirname, inode->i_ino, filp->f_pos); + "name %s, inode %ld, ctx->pos %Ld", + dirname, inode->i_ino, ctx->pos); - result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1, +more: + result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, keybuf, &keysize, &value); if (result == BEFS_ERR) { @@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* Convert to NLS */ if (BEFS_SB(sb)->nls) { + char *nlsname; + int nlsnamelen; result = befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); if (result < 0) { befs_debug(sb, "<--- befs_readdir() ERROR"); return result; } - result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos, - (ino_t) value, d_type); + if (!dir_emit(ctx, nlsname, nlsnamelen, + (ino_t) value, d_type)) { + kfree(nlsname); + return 0; + } kfree(nlsname); - } else { - result = filldir(dirent, keybuf, keysize, filp->f_pos, - (ino_t) value, d_type); + if (!dir_emit(ctx, keybuf, keysize, + (ino_t) value, d_type)) + return 0; } + ctx->pos++; + goto more; - filp->f_pos++; - - befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); + befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); return 0; } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 3f422f6..a399e6d 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir, const unsigned char *name, int namelen, struct bfs_dirent **res_dir); -static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) +static int bfs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct buffer_head *bh; struct bfs_dirent *de; - struct bfs_sb_info *info = BFS_SB(dir->i_sb); unsigned int offset; int block; - mutex_lock(&info->bfs_lock); - - if (f->f_pos & (BFS_DIRENT_SIZE - 1)) { + if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", - (unsigned long)f->f_pos, + (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); - mutex_unlock(&info->bfs_lock); - return -EBADF; + return -EINVAL; } - while (f->f_pos < dir->i_size) { - offset = f->f_pos & (BFS_BSIZE - 1); - block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS); + while (ctx->pos < dir->i_size) { + offset = ctx->pos & (BFS_BSIZE - 1); + block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); bh = sb_bread(dir->i_sb, block); if (!bh) { - f->f_pos += BFS_BSIZE - offset; + ctx->pos += BFS_BSIZE - offset; continue; } do { de = (struct bfs_dirent *)(bh->b_data + offset); if (de->ino) { int size = strnlen(de->name, BFS_NAMELEN); - if (filldir(dirent, de->name, size, f->f_pos, + if (!dir_emit(ctx, de->name, size, le16_to_cpu(de->ino), - DT_UNKNOWN) < 0) { + DT_UNKNOWN)) { brelse(bh); - mutex_unlock(&info->bfs_lock); return 0; } } offset += BFS_DIRENT_SIZE; - f->f_pos += BFS_DIRENT_SIZE; - } while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size)); + ctx->pos += BFS_DIRENT_SIZE; + } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); brelse(bh); } - - mutex_unlock(&info->bfs_lock); - return 0; + return 0; } const struct file_operations bfs_dir_operations = { .read = generic_read_dir, - .readdir = bfs_readdir, + .iterate = bfs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index bce8769..89dec7f 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_data = N_DATADDR(ex)); current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); if (retval < 0) { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8a0b0e..100edcc 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - current->mm->free_area_cache = current->mm->mmap_base; - current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 2091db8..bb43ce0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -325,31 +325,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = file->f_mapping->host; - loff_t size; loff_t retval; mutex_lock(&bd_inode->i_mutex); - size = i_size_read(bd_inode); - - retval = -EINVAL; - switch (whence) { - case SEEK_END: - offset += size; - break; - case SEEK_CUR: - offset += file->f_pos; - case SEEK_SET: - break; - default: - goto out; - } - if (offset >= 0 && offset <= size) { - if (offset != file->f_pos) { - file->f_pos = offset; - } - retval = offset; - } -out: + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); mutex_unlock(&bd_inode->i_mutex); return retval; } @@ -1583,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = { .writepages = generic_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .is_dirty_writeback = buffer_check_dirty_writeback, }; const struct file_operations def_blk_fops = { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b4fb415..eaf1333 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * to a logical address */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - int search_commit_root, - u64 time_seq, - struct __prelim_ref *ref, - struct ulist *parents, - const u64 *extent_item_pos) + struct btrfs_path *path, u64 time_seq, + struct __prelim_ref *ref, + struct ulist *parents, + const u64 *extent_item_pos) { - struct btrfs_path *path; struct btrfs_root *root; struct btrfs_key root_key; struct extent_buffer *eb; @@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int root_level; int level = ref->level; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->search_commit_root = !!search_commit_root; - root_key.objectid = ref->root_id; root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = (u64)-1; @@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, time_seq, ref->wanted_disk_byte, extent_item_pos); out: - btrfs_free_path(path); + path->lowest_level = 0; + btrfs_release_path(path); return ret; } @@ -322,7 +316,7 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, u64 time_seq, + struct btrfs_path *path, u64 time_seq, struct list_head *head, const u64 *extent_item_pos) { @@ -349,9 +343,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; - err = __resolve_indirect_ref(fs_info, search_commit_root, - time_seq, ref, parents, - extent_item_pos); + err = __resolve_indirect_ref(fs_info, path, time_seq, ref, + parents, extent_item_pos); if (err == -ENOMEM) goto out; if (err) @@ -604,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, int slot; struct extent_buffer *leaf; struct btrfs_key key; + struct btrfs_key found_key; unsigned long ptr; unsigned long end; struct btrfs_extent_item *ei; @@ -621,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); + } else if (found_key.type == BTRFS_METADATA_ITEM_KEY) { + *info_level = found_key.offset; } else { BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } @@ -795,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; - int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); struct list_head prefs_delayed; struct list_head prefs; struct __prelim_ref *ref; @@ -804,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&prefs_delayed); key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->search_commit_root = !!search_commit_root; + if (!trans) + path->search_commit_root = 1; /* * grab both a lock on the path and a lock on the delayed ref head. @@ -825,7 +826,7 @@ again: goto out; BUG_ON(ret == 0); - if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { + if (trans) { /* * look if there are updates for this ref queued and lock the * head @@ -869,7 +870,8 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY) { + (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, &info_level, &prefs); if (ret) @@ -890,8 +892,8 @@ again: __merge_refs(&prefs, 1); - ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, - &prefs, extent_item_pos); + ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, + extent_item_pos); if (ret) goto out; @@ -918,7 +920,8 @@ again: ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - return -EIO; + ret = -EIO; + goto out; } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); @@ -1282,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, { int ret; u64 flags; + u64 size = 0; u32 item_size; struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; - key.type = BTRFS_EXTENT_ITEM_KEY; + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; key.objectid = logical; key.offset = (u64)-1; @@ -1300,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, return ret; btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); - if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + if (found_key->type == BTRFS_METADATA_ITEM_KEY) + size = fs_info->extent_root->leafsize; + else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) + size = found_key->offset; + + if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && + found_key->type != BTRFS_METADATA_ITEM_KEY) || found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) { + found_key->objectid + size <= logical) { pr_debug("logical %llu is not within any extent\n", (unsigned long long)logical); return -ENOENT; @@ -1458,7 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct ulist *refs = NULL; struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; @@ -1470,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); - if (search_commit_root) { - trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; - } else { + if (!search_commit_root) { trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0f446d7..8f2e767 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -23,8 +23,6 @@ #include "ulist.h" #include "extent_io.h" -#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) - struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 18af6f4..1431a69 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1700,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; DECLARE_COMPLETION_ONSTACK(complete); - bio = bio_alloc(GFP_NOFS, num_pages - i); + bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { printk(KERN_INFO "btrfsic: bio_alloc() for %u pages failed!\n", diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index de6de8e..5bf4c39 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -951,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { + int level = btrfs_header_level(buf); + ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len, - new_flags, 0); + new_flags, level, 0); if (ret) return ret; } @@ -1087,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1159,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, * time_seq). */ static void -__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, - struct tree_mod_elem *first_tm) +__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq, struct tree_mod_elem *first_tm) { u32 n; struct rb_node *next; @@ -1170,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); + tree_mod_log_read_lock(fs_info); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1224,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, if (tm->index != first_tm->index) break; } + tree_mod_log_read_unlock(fs_info); btrfs_set_header_nritems(eb, n); } @@ -1272,7 +1277,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, extent_buffer_get(eb_rewin); btrfs_tree_read_lock(eb_rewin); - __tree_mod_log_rewind(eb_rewin, time_seq, tm); + __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); @@ -1348,7 +1353,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_generation(eb, old_generation); } if (tm) - __tree_mod_log_rewind(eb, time_seq, tm); + __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); @@ -2176,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root, } } -/* - * returns -EAGAIN if it had to drop the path, or zero if everything was in - * cache - */ -static noinline int reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) { int slot; int nritems; @@ -2190,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int ret = 0; int blocksize; parent = path->nodes[level + 1]; if (!parent) - return 0; + return; nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; @@ -2222,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, block2 = 0; free_extent_buffer(eb); } - if (block1 || block2) { - ret = -EAGAIN; - - /* release the whole path */ - btrfs_release_path(path); - /* read the blocks */ - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); - - if (block1) { - eb = read_tree_block(root, block1, blocksize, 0); - free_extent_buffer(eb); - } - if (block2) { - eb = read_tree_block(root, block2, blocksize, 0); - free_extent_buffer(eb); - } - } - return ret; + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); } @@ -2357,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans, tmp = btrfs_find_tree_block(root, blocknr, blocksize); if (tmp) { /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - /* - * we found an up to date block without - * sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; - } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - free_extent_buffer(tmp); - btrfs_set_path_blocking(p); + if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + *eb_ret = tmp; + return 0; + } - /* now we're allowed to do a blocking uptodate check */ - tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { - *eb_ret = tmp; - return 0; - } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + btrfs_set_path_blocking(p); + + /* now we're allowed to do a blocking uptodate check */ + ret = btrfs_read_buffer(tmp, gen); + if (!ret) { + *eb_ret = tmp; + return 0; } + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } /* @@ -2446,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = split_node(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2470,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, goto again; } - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - btrfs_set_path_blocking(p); + reada_for_balance(root, p, level); sret = balance_level(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -3141,7 +3111,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, */ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int log_removal) + struct btrfs_path *path, int level) { u64 lower_gen; struct extent_buffer *lower; @@ -3192,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c, log_removal); + tree_mod_log_set_root_pointer(root, c, 0); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -3276,14 +3246,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans, /* * trying to split the root, lets make a new one * - * tree mod log: We pass 0 as log_removal parameter to + * tree mod log: We don't log_removal old root in * insert_new_root, because that root buffer will be kept as a * normal node. We are going to log removal of half of the * elements below with tree_mod_log_eb_copy. We're holding a * tree lock on the buffer, which is why we cannot race with * other tree_mod_log users. */ - ret = insert_new_root(trans, root, path, level + 1, 0); + ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; } else { @@ -3984,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size) { + if (data_size && path->nodes[1]) { wret = push_leaf_right(trans, root, path, data_size, data_size, 0, 0); if (wret < 0) @@ -4003,7 +3973,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, } if (!path->nodes[1]) { - ret = insert_new_root(trans, root, path, 1, 1); + ret = insert_new_root(trans, root, path, 1); if (ret) return ret; } @@ -4428,7 +4398,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, } /* - * make the item pointed to by the path bigger, data_size is the new size. + * make the item pointed to by the path bigger, data_size is the added size. */ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 63c328a..e795bf1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -88,12 +88,12 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -961,8 +961,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE enum btrfs_raid_types { @@ -1102,6 +1102,18 @@ struct btrfs_space_info { account */ /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed everytime we call + * btrfs_free_extent so it is a realtime count of what will be freed + * once the transaction is committed. It will be zero'ed everytime the + * transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + /* * we bump reservation progress every time we decrement * bytes_reserved. This way people waiting for reservations * know something good has happened and they can check @@ -1437,25 +1449,22 @@ struct btrfs_fs_info { atomic_t open_ioctl_trans; /* - * this is used by the balancing code to wait for all the pending - * ordered extents + * this is used to protect the following list -- ordered_roots. */ - spinlock_t ordered_extent_lock; + spinlock_t ordered_root_lock; /* - * all of the data=ordered extents pending writeback + * all fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ - struct list_head ordered_extents; + struct list_head ordered_roots; - spinlock_t delalloc_lock; - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ - struct list_head delalloc_inodes; + spinlock_t delalloc_root_lock; + /* all fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes @@ -1498,8 +1507,6 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; - int enospc_unlink; - int trans_no_join; u64 total_pinned; @@ -1594,6 +1601,12 @@ struct btrfs_fs_info { struct rb_root qgroup_tree; spinlock_t qgroup_lock; + /* + * used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + /* protect user change for quota operations */ struct mutex qgroup_ioctl_lock; @@ -1607,6 +1620,8 @@ struct btrfs_fs_info { struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; struct btrfs_workers qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; /* filesystem state */ unsigned long fs_state; @@ -1739,6 +1754,31 @@ struct btrfs_root { int force_cow; spinlock_t root_item_lock; + atomic_t refs; + + spinlock_t delalloc_lock; + /* + * all of the inodes that have delalloc bytes. It is possible for + * this list to be empty even when there is still dirty data=ordered + * extents waiting to finish IO. + */ + struct list_head delalloc_inodes; + struct list_head delalloc_root; + u64 nr_delalloc_inodes; + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + + /* + * all of the data=ordered extents pending writeback + * these can span multiple transactions and basically include + * every dirty data page that isn't from nodatacow + */ + struct list_head ordered_extents; + struct list_head ordered_root; + u64 nr_ordered_extents; }; struct btrfs_ioctl_defrag_range_args { @@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, num_items; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct btrfs_root *root, + struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -3075,7 +3119,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data); + int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, @@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); @@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +{ + return (root->fs_info->sb->s_flags & MS_RDONLY || + btrfs_fs_closing(root->fs_info)); +} + static inline void free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info->balance_ctl); @@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *item); void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct - btrfs_root_item *item, struct btrfs_key *key); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); @@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); +noinline int can_nocow_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes); /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ #if defined(ClearPageFsMisc) && !defined(ClearPageChecked) @@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f26f38c..3755109 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, - u64 root_id) -{ - struct btrfs_key root_key; - - if (root->objectid == root_id) - return root; - - root_key.objectid = root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(root->fs_info, &root_key); -} - static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_item *item) @@ -1681,8 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree * */ -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list) { struct btrfs_dir_item *di; @@ -1704,13 +1689,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (curr->key.offset < filp->f_pos) { + if (curr->key.offset < ctx->pos) { if (atomic_dec_and_test(&curr->refs)) kfree(curr); continue; } - filp->f_pos = curr->key.offset; + ctx->pos = curr->key.offset; di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); @@ -1719,7 +1704,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, d_type = btrfs_filetype_table[di->type]; btrfs_disk_key_to_cpu(&location, &di->location); - over = filldir(dirent, name, name_len, curr->key.offset, + over = !dir_emit(ctx, name, name_len, location.objectid, d_type); if (atomic_dec_and_test(&curr->refs)) diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 1d5c5f7..a4b38f9 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list, struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list); /* for init */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f75fcaf..70b962c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; u64 flags_to_set; + int level; unsigned int update_key:1; unsigned int update_flags:1; unsigned int is_data:1; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7ba7b39..4253ad5 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; + if (btrfs_fs_incompat(fs_info, RAID56)) { + pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); + return -EINVAL; + } + switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: @@ -395,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -465,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e9ebe1..6b092a1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -152,7 +152,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, @@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) return try_release_extent_buffer(page); } -static void btree_invalidatepage(struct page *page, unsigned long offset) +static void btree_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -1191,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; + root->nr_delalloc_inodes = 0; + root->nr_ordered_extents = 0; root->name = NULL; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); @@ -1199,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->delalloc_inodes); + INIT_LIST_HEAD(&root->delalloc_root); + INIT_LIST_HEAD(&root->ordered_extents); + INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->delalloc_lock); + spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); @@ -1216,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); + atomic_set(&root->refs, 1); root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, @@ -1234,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->root_item_lock); } -static int __must_check find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) -{ - int ret; - u32 blocksize; - u64 generation; - - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); - ret = btrfs_find_last_root(tree_root, objectid, - &root->root_item, &root->root_key); - if (ret > 0) - return -ENOENT; - else if (ret < 0) - return ret; - - generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); - root->commit_root = NULL; - root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); - if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) { - free_extent_buffer(root->node); - root->node = NULL; - return -EIO; - } - root->commit_root = btrfs_root_node(root); - return 0; -} - static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); @@ -1451,71 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location) +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; - struct extent_buffer *l; u64 generation; u32 blocksize; - int ret = 0; - int slot; + int ret; - root = btrfs_alloc_root(fs_info); - if (!root) + path = btrfs_alloc_path(); + if (!path) return ERR_PTR(-ENOMEM); - if (location->offset == (u64)-1) { - ret = find_and_setup_root(tree_root, fs_info, - location->objectid, root); - if (ret) { - kfree(root); - return ERR_PTR(ret); - } - goto out; + + root = btrfs_alloc_root(fs_info); + if (!root) { + ret = -ENOMEM; + goto alloc_fail; } __setup_root(tree_root->nodesize, tree_root->leafsize, tree_root->sectorsize, tree_root->stripesize, - root, fs_info, location->objectid); + root, fs_info, key->objectid); - path = btrfs_alloc_path(); - if (!path) { - kfree(root); - return ERR_PTR(-ENOMEM); - } - ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret == 0) { - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_read_root_item(l, slot, &root->root_item); - memcpy(&root->root_key, location, sizeof(*location)); - } - btrfs_free_path(path); + ret = btrfs_find_root(tree_root, key, path, + &root->root_item, &root->root_key); if (ret) { - kfree(root); if (ret > 0) ret = -ENOENT; - return ERR_PTR(ret); + goto find_fail; } generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - if (!root->node || !extent_buffer_uptodate(root->node)) { - ret = (!root->node) ? -ENOMEM : -EIO; - - free_extent_buffer(root->node); - kfree(root); - return ERR_PTR(ret); + if (!root->node) { + ret = -ENOMEM; + goto find_fail; + } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + ret = -EIO; + goto read_fail; } - root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + btrfs_free_path(path); + return root; + +read_fail: + free_extent_buffer(root->node); +find_fail: + kfree(root); +alloc_fail: + root = ERR_PTR(ret); + goto out; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + + root = btrfs_read_tree_root(tree_root, location); + if (IS_ERR(root)) + return root; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; btrfs_check_and_init_root_item(&root->root_item); } @@ -1523,6 +1502,66 @@ out: return root; } +int btrfs_init_fs_root(struct btrfs_root *root) +{ + int ret; + + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; + goto fail; + } + + btrfs_init_free_ino_ctl(root); + mutex_init(&root->fs_commit_mutex); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + return 0; +fail: + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); + return ret; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) +{ + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_radix_lock); + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_id); + spin_unlock(&fs_info->fs_roots_radix_lock); + return root; +} + +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + int ret; + + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + return ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret == 0) + root->in_radix = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + + return ret; +} + struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location) { @@ -1543,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->quota_root ? fs_info->quota_root : ERR_PTR(-ENOENT); again: - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)location->objectid); - spin_unlock(&fs_info->fs_roots_radix_lock); + root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) return root; - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + root = btrfs_read_fs_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; goto fail; } - btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); - - ret = get_anon_bdev(&root->anon_dev); + ret = btrfs_init_fs_root(root); if (ret) goto fail; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = -ENOENT; - goto fail; - } - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); if (ret < 0) goto fail; if (ret == 0) root->orphan_item_inserted = 1; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) - goto fail; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); - if (ret == 0) - root->in_radix = 1; - - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { free_fs_root(root); @@ -1602,10 +1613,6 @@ again: } goto fail; } - - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - WARN_ON(ret); return root; fail: free_fs_root(root); @@ -1677,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; + int again; do { - int again = 0; - - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && - down_read_trylock(&root->fs_info->sb->s_umount)) { - if (mutex_trylock(&root->fs_info->cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - again = btrfs_clean_one_deleted_snapshot(root); - mutex_unlock(&root->fs_info->cleaner_mutex); - } - btrfs_run_defrag_inodes(root->fs_info); - up_read(&root->fs_info->sb->s_umount); + again = 0; + + /* Make the cleaner go to sleep early. */ + if (btrfs_need_cleaner_sleep(root)) + goto sleep; + + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) + goto sleep; + + /* + * Avoid the problem that we change the status of the fs + * during the above check and trylock. + */ + if (btrfs_need_cleaner_sleep(root)) { + mutex_unlock(&root->fs_info->cleaner_mutex); + goto sleep; } + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + /* + * The defragger has dealt with the R/O remount and umount, + * needn't do anything special here. + */ + btrfs_run_defrag_inodes(root->fs_info); +sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) @@ -1725,7 +1748,7 @@ static int transaction_kthread(void *arg) } now = get_seconds(); - if (!cur->blocked && + if (cur->state < TRANS_STATE_BLOCKED && (now < cur->start_time || now - cur->start_time < 30)) { spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; @@ -1988,30 +2011,33 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { free_extent_buffer(info->tree_root->node); free_extent_buffer(info->tree_root->commit_root); - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - if (info->quota_root) { - free_extent_buffer(info->quota_root->node); - free_extent_buffer(info->quota_root->commit_root); - } - info->tree_root->node = NULL; info->tree_root->commit_root = NULL; - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; + + if (info->dev_root) { + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + } + if (info->extent_root) { + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + } + if (info->csum_root) { + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + } if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); info->quota_root->node = NULL; info->quota_root->commit_root = NULL; } - if (chunk_root) { free_extent_buffer(info->chunk_root->node); free_extent_buffer(info->chunk_root->commit_root); @@ -2032,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) list_del(&gang[0]->root_list); if (gang[0]->in_radix) { - btrfs_free_fs_root(fs_info, gang[0]); + btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); free_extent_buffer(gang[0]->commit_root); - kfree(gang[0]); + btrfs_put_fs_root(gang[0]); } } @@ -2047,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) if (!ret) break; for (i = 0; i < ret; i++) - btrfs_free_fs_root(fs_info, gang[i]); + btrfs_drop_and_free_fs_root(fs_info, gang[i]); } } @@ -2079,14 +2105,8 @@ int open_ctree(struct super_block *sb, int backup_index = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); - csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); - dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info); - - if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root || !quota_root) { + if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; } @@ -2129,9 +2149,9 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); - spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); @@ -2167,7 +2187,6 @@ int open_ctree(struct super_block *sb, fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->trans_no_join = 0; fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; @@ -2178,8 +2197,8 @@ int open_ctree(struct super_block *sb, fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); - INIT_LIST_HEAD(&fs_info->ordered_extents); - spin_lock_init(&fs_info->ordered_extent_lock); + INIT_LIST_HEAD(&fs_info->ordered_roots); + spin_lock_init(&fs_info->ordered_root_lock); fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_NOFS); if (!fs_info->delayed_root) { @@ -2272,6 +2291,7 @@ int open_ctree(struct super_block *sb, fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; mutex_init(&fs_info->qgroup_rescan_lock); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); @@ -2636,33 +2656,44 @@ retry_root_backup: btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + extent_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(extent_root)) { + ret = PTR_ERR(extent_root); goto recovery_tree_root; + } extent_root->track_dirty = 1; + fs_info->extent_root = extent_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_DEV_TREE_OBJECTID, dev_root); - if (ret) + location.objectid = BTRFS_DEV_TREE_OBJECTID; + dev_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(dev_root)) { + ret = PTR_ERR(dev_root); goto recovery_tree_root; + } dev_root->track_dirty = 1; + fs_info->dev_root = dev_root; + btrfs_init_devices_late(fs_info); - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_CSUM_TREE_OBJECTID, csum_root); - if (ret) + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + csum_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(csum_root)) { + ret = PTR_ERR(csum_root); goto recovery_tree_root; + } csum_root->track_dirty = 1; + fs_info->csum_root = csum_root; - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_QUOTA_TREE_OBJECTID, quota_root); - if (ret) { - kfree(quota_root); - quota_root = fs_info->quota_root = NULL; - } else { + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + quota_root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(quota_root)) { quota_root->track_dirty = 1; fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; + fs_info->quota_root = quota_root; } fs_info->generation = generation; @@ -2815,11 +2846,9 @@ retry_root_backup: location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; + location.offset = 0; fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (!fs_info->fs_root) - goto fail_qgroup; if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); goto fail_qgroup; @@ -2851,14 +2880,16 @@ retry_root_backup: return ret; } + btrfs_qgroup_rescan_resume(fs_info); + return 0; fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); - del_fs_roots(fs_info); btrfs_cleanup_transaction(fs_info->tree_root); + del_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -3128,7 +3159,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) * caller */ device->flush_bio = NULL; - bio = bio_alloc(GFP_NOFS, 0); + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) return -ENOMEM; @@ -3256,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( BTRFS_BLOCK_GROUP_RAID10)) { num_tolerated_disk_barrier_failures = 1; } else if (flags & - BTRFS_BLOCK_GROUP_RAID5) { + BTRFS_BLOCK_GROUP_RAID6) { num_tolerated_disk_barrier_failures = 2; } } @@ -3364,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +/* Drop a fs root from the radix tree and free it. */ +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, @@ -3395,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root) kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); kfree(root->name); - kfree(root); + btrfs_put_fs_root(root); +} + +void btrfs_free_fs_root(struct btrfs_root *root) +{ + free_fs_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) @@ -3510,15 +3548,15 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_root_pointers(fs_info, 1); - btrfs_free_block_groups(fs_info); + btrfs_stop_all_workers(fs_info); + del_fs_roots(fs_info); - iput(fs_info->btree_inode); + free_root_pointers(fs_info, 1); - btrfs_stop_all_workers(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) @@ -3651,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, INIT_LIST_HEAD(&splice); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { @@ -3659,11 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, ordered_operations); list_del_init(&btrfs_inode->ordered_operations); + spin_unlock(&root->fs_info->ordered_root_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); mutex_unlock(&root->fs_info->ordered_operations_mutex); } @@ -3671,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); /* * This will just short circuit the ordered completion stuff which will * make sure the ordered extent gets properly cleaned up. */ - list_for_each_entry(ordered, &root->fs_info->ordered_extents, + list_for_each_entry(ordered, &root->ordered_extents, root_extent_list) set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->ordered_extent_lock); +} + +static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + list_del_init(&root->ordered_root); + + btrfs_destroy_ordered_extents(root); + + cond_resched_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); } int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -3701,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->root)) != NULL) { struct btrfs_delayed_ref_head *head = NULL; + bool pin_bytes = false; ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); @@ -3721,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, } if (head->must_insert_reserved) - btrfs_pin_extent(root, ref->bytenr, - ref->num_bytes, 1); + pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) @@ -3733,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (head) - mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); + if (head) { + if (pin_bytes) + btrfs_pin_extent(root, ref->bytenr, + ref->num_bytes, 1); + mutex_unlock(&head->mutex); + } btrfs_put_delayed_ref(ref); cond_resched(); @@ -3772,21 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + btrfs_inode = list_first_entry(&splice, struct btrfs_inode, + delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &btrfs_inode->runtime_flags); + spin_unlock(&root->delalloc_lock); btrfs_invalidate_inodes(btrfs_inode->root); + + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + list_del_init(&root->delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + spin_unlock(&fs_info->delalloc_root_lock); + + btrfs_destroy_delalloc_inodes(root); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->delalloc_root_lock); + } + spin_unlock(&fs_info->delalloc_root_lock); } static int btrfs_destroy_marked_extents(struct btrfs_root *root, @@ -3808,7 +3902,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, while (start <= end) { eb = btrfs_find_tree_block(root, start, root->leafsize); - start += eb->len; + start += root->leafsize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); @@ -3870,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, cur_trans->dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ - cur_trans->in_commit = 1; - cur_trans->blocked = 1; + cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); btrfs_evict_pending_snapshots(cur_trans); - cur_trans->blocked = 0; + cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - cur_trans->commit_done = 1; - wake_up(&cur_trans->commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); @@ -3891,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + cur_trans->state =TRANS_STATE_COMPLETED; + wake_up(&cur_trans->commit_wait); + /* memset(cur_trans, 0, sizeof(*cur_trans)); kmem_cache_free(btrfs_transaction_cachep, cur_trans); @@ -3906,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->trans_no_join = 1; + root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&list)) { @@ -3914,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_ordered_operations(t, root); - btrfs_destroy_ordered_extents(root); + btrfs_destroy_all_ordered_extents(root->fs_info); btrfs_destroy_delayed_refs(t, root); - /* FIXME: cleanup wait for commit */ - t->in_commit = 1; - t->blocked = 1; + /* + * FIXME: cleanup wait for commit + * We needn't acquire the lock here, because we are during + * the umount, there is no other task which will change it. + */ + t->state = TRANS_STATE_COMMIT_START; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) wake_up(&root->fs_info->transaction_blocked_wait); btrfs_evict_pending_snapshots(t); - t->blocked = 0; + t->state = TRANS_STATE_UNBLOCKED; smp_mb(); if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); - t->commit_done = 1; - smp_mb(); - if (waitqueue_active(&t->commit_wait)) - wake_up(&t->commit_wait); - btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); - btrfs_destroy_delalloc_inodes(root); - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); + btrfs_destroy_all_delalloc_inodes(root->fs_info); btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -3952,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); + t->state = TRANS_STATE_COMPLETED; + smp_mb(); + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + atomic_set(&t->use_count, 0); list_del_init(&t->list); memset(t, 0, sizeof(*t)); kmem_cache_free(btrfs_transaction_cachep, t); } - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); return 0; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index be69ce1..b71acd6e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, + struct btrfs_key *location); +int btrfs_init_fs_root(struct btrfs_root *root); +int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root); void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root); +void btrfs_free_fs_root(struct btrfs_root *root); + +/* + * This function is used to grab the root, and avoid it is freed when we + * access it. But it doesn't ensure that the tree is not dropped. + * + * If you want to ensure the whole tree is safe, you should use + * fs_info->subvol_srcu + */ +static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root) +{ + if (atomic_inc_not_zero(&root->refs)) + return root; + return NULL; +} + +static inline void btrfs_put_fs_root(struct btrfs_root *root) +{ + if (atomic_dec_and_test(&root->refs)) + kfree(root); +} + void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 81ee29e..4b86916 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, goto fail; } - if (btrfs_root_refs(&root->root_item) == 0) { - err = -ENOENT; - goto fail; - } - key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2305b5c..0236de7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -24,6 +24,7 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/percpu_counter.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -2070,8 +2071,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY); + int metadata = !extent_op->is_data; if (trans->aborted) return 0; @@ -2086,11 +2086,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, key.objectid = node->bytenr; if (metadata) { - struct btrfs_delayed_tree_ref *tree_ref; - - tree_ref = btrfs_delayed_node_to_tree_ref(node); key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = tree_ref->level; + key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; @@ -2530,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, return 0; } +static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) +{ + u64 num_bytes; + + num_bytes = heads * (sizeof(struct btrfs_extent_item) + + sizeof(struct btrfs_extent_inline_ref)); + if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + num_bytes += heads * sizeof(struct btrfs_tree_block_info); + + /* + * We don't ever fill up leaves all the way so multiply by 2 just to be + * closer to what we're really going to want to ouse. + */ + return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *global_rsv; + u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; + u64 num_bytes; + int ret = 0; + + num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_heads = heads_to_leaves(root, num_heads); + if (num_heads > 1) + num_bytes += (num_heads - 1) * root->leafsize; + num_bytes <<= 1; + global_rsv = &root->fs_info->global_block_rsv; + + /* + * If we can't allocate any more chunks lets make sure we have _lots_ of + * wiggle room since running delayed refs can create more delayed refs. + */ + if (global_rsv->space_info->full) + num_bytes <<= 1; + + spin_lock(&global_rsv->lock); + if (global_rsv->reserved <= num_bytes) + ret = 1; + spin_unlock(&global_rsv->lock); + return ret; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2577,7 +2619,8 @@ progress: old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); if (old) { DEFINE_WAIT(__wait); - if (delayed_refs->num_entries < 16348) + if (delayed_refs->flushing || + !btrfs_should_throttle_delayed_refs(trans, root)) return 0; prepare_to_wait(&delayed_refs->wait, &__wait, @@ -2612,7 +2655,7 @@ again: while (1) { if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) + !btrfs_should_throttle_delayed_refs(trans, root)) break; /* @@ -2633,6 +2676,7 @@ again: spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); atomic_dec(&delayed_refs->procs_running_refs); + wake_up(&delayed_refs->wait); return ret; } @@ -2719,7 +2763,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, - int is_data) + int level, int is_data) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2732,6 +2776,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->update_flags = 1; extent_op->update_key = 0; extent_op->is_data = is_data ? 1 : 0; + extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); @@ -3109,6 +3154,11 @@ again: WARN_ON(ret); if (i_size_read(inode) > 0) { + ret = btrfs_check_trunc_cache_free_space(root, + &root->fs_info->global_block_rsv); + if (ret) + goto out_put; + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) @@ -3308,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info *found; int i; int factor; + int ret; if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) @@ -3331,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; + ret = percpu_counter_init(&found->total_bytes_pinned, 0); + if (ret) { + kfree(found); + return ret; + } + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); @@ -3563,10 +3620,11 @@ alloc: } /* - * If we have less pinned bytes than we want to allocate then - * don't bother committing the transaction, it won't help us. + * If we don't have enough pinned space to deal with this + * allocation don't bother committing the transaction. */ - if (data_sinfo->bytes_pinned < bytes) + if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, + bytes) < 0) committed = 1; spin_unlock(&data_sinfo->lock); @@ -3575,6 +3633,7 @@ commit_trans: if (!committed && !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3607,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); + WARN_ON(data_sinfo->bytes_may_use < bytes); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 0); @@ -3884,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, unsigned long nr_pages) { struct super_block *sb = root->fs_info->sb; - int started; - /* If we can not start writeback, just sync all the delalloc file. */ - started = try_to_writeback_inodes_sb_nr(sb, nr_pages, - WB_REASON_FS_FREE_SPACE); - if (!started) { + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { /* * We needn't worry the filesystem going from r/w to r/o though * we don't acquire ->s_umount mutex, because the filesystem @@ -3897,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_inodes(root, 0); + btrfs_start_all_delalloc_inodes(root->fs_info, 0); if (!current->journal_info) - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); } } @@ -3929,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); return; } @@ -3957,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_all_ordered_extents(root->fs_info, 0); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -3995,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root, /* See if there is enough pinned space to make this reservation */ spin_lock(&space_info->lock); - if (space_info->bytes_pinned >= bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes) >= 0) { spin_unlock(&space_info->lock); goto commit; } @@ -4010,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root, spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (space_info->bytes_pinned + delayed_rsv->size < bytes) { + if (percpu_counter_compare(&space_info->total_bytes_pinned, + bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); spin_unlock(&space_info->lock); return -ENOSPC; @@ -4295,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + block_rsv_add_bytes(dest, num_bytes, 1); + return 0; +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -4562,6 +4648,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; update_global_block_rsv(fs_info); @@ -5026,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root, int factor; /* block accounting for super block */ - spin_lock(&info->delalloc_lock); + spin_lock(&info->delalloc_root_lock); old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); + spin_unlock(&info->delalloc_root_lock); while (total) { cache = btrfs_lookup_block_group(info, bytenr); @@ -5185,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, return ret; } +static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + + block_group = btrfs_lookup_block_group(root->fs_info, start); + if (!block_group) + return -EINVAL; + + cache_block_group(block_group, 0); + caching_ctl = get_caching_control(block_group); + + if (!caching_ctl) { + /* Logic error */ + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + if (ret) + goto out_lock; + + num_bytes = (start + num_bytes) - + caching_ctl->progress; + start = caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + } +out_lock: + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + btrfs_put_block_group(block_group); + return ret; +} + +int btrfs_exclude_logged_extents(struct btrfs_root *log, + struct extent_buffer *eb) +{ + struct btrfs_file_extent_item *item; + struct btrfs_key key; + int found_type; + int i; + + if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + return 0; + + for (i = 0; i < btrfs_header_nritems(eb); i++) { + btrfs_item_key_to_cpu(eb, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + if (found_type == BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + continue; + key.objectid = btrfs_file_extent_disk_bytenr(eb, item); + key.offset = btrfs_file_extent_disk_num_bytes(eb, item); + __exclude_logged_extent(log, key.objectid, key.offset); + } + + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -5247,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; down_write(&fs_info->extent_commit_sem); @@ -5269,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->extent_commit_sem); + list_for_each_entry_rcu(space_info, &fs_info->space_info, list) + percpu_counter_set(&space_info->total_bytes_pinned, 0); + update_global_block_rsv(fs_info); } @@ -5366,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, return 0; } +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, + u64 owner, u64 root_objectid) +{ + struct btrfs_space_info *space_info; + u64 flags; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + } else { + flags = BTRFS_BLOCK_GROUP_DATA; + } + + space_info = __find_space_info(fs_info, flags); + BUG_ON(!space_info); /* Logic bug */ + percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); +} + + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -5586,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + root_objectid); } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != @@ -5709,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; + int pin = 1; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -5741,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + pin = 0; } out: + if (pin) + add_pinned_bytes(root->fs_info, buf->len, + btrfs_header_level(buf), + root->root_key.objectid); + /* * Deleting the buffer, clear the corrupt flag since it doesn't matter * anymore. @@ -5759,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. @@ -6556,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, 0); - caching_ctl = get_caching_control(block_group); - - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - if (ret) - goto out; - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); + /* + * Mixed block groups will exclude before processing the log so we only + * need to do the exlude dance if this fs isn't mixed. + */ + if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(root, ins->objectid, ins->offset); if (ret) - goto out; + return ret; } + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + if (!block_group) + return -EINVAL; + ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); -out: btrfs_put_block_group(block_group); return ret; } @@ -6651,51 +6823,51 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; + bool global_updated = false; block_rsv = get_block_rsv(trans, root); - if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve. - */ - if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - return ERR_PTR(ret); - } else if (ret) { - return ERR_PTR(ret); - } + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + update_global_block_rsv(root->fs_info); + goto again; } - ret = block_rsv_use_bytes(block_rsv, blocksize); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } +try_reserve: + ret = reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; - if (ret && !block_rsv->failfast) { - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); - } - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - return block_rsv; - } else if (ret && block_rsv != global_rsv) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; } - - return ERR_PTR(-ENOSPC); + return ERR_PTR(ret); } static void unuse_block_rsv(struct btrfs_fs_info *fs_info, @@ -6763,6 +6935,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, extent_op->update_key = 1; extent_op->update_flags = 1; extent_op->is_data = 0; + extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins.objectid, @@ -6934,7 +7107,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, - eb->len, flag, 0); + eb->len, flag, + btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -7378,7 +7552,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { - if (!for_reloc && btrfs_fs_closing(root->fs_info)) { + if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("btrfs: drop snapshot early exit\n"); err = -EAGAIN; goto out_end_trans; @@ -7441,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); + ret = btrfs_find_root(tree_root, &root->root_key, path, + NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, tree_root, ret); err = ret; @@ -7459,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); + btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); - kfree(root); + btrfs_put_fs_root(root); } out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); @@ -7776,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_trans_handle *trans; u64 min_free; u64 dev_min = 1; u64 dev_nr = 0; @@ -7862,6 +8037,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) do_div(min_free, dev_min); } + /* We need to do this so that we can look at pending chunks */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -7872,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !device->is_tgtdev_for_dev_replace) { - ret = find_free_dev_extent(device, min_free, + ret = find_free_dev_extent(trans, device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; @@ -7884,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) } } mutex_unlock(&root->fs_info->chunk_mutex); + btrfs_end_transaction(trans, root); out: btrfs_put_block_group(block_group); return ret; @@ -8026,6 +8209,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } + percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); kfree(space_info); } @@ -8248,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, sizeof(item)); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + ret = btrfs_finish_chunk_alloc(trans, extent_root, + key.objectid, key.offset); + if (ret) + btrfs_abort_transaction(trans, extent_root, ret); } } @@ -8585,8 +8773,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { ret = cache_block_group(cache, 0); - if (!ret) - wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } + ret = wait_block_group_cache_done(cache); + if (ret) { + btrfs_put_block_group(cache); + break; + } } ret = btrfs_trim_block_group(cache, &group_trimmed, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 32d67a8..583d98b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -23,6 +23,7 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +static struct bio_set *btrfs_bioset; #ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); @@ -76,10 +77,29 @@ void btrfs_leak_debug_check(void) kmem_cache_free(extent_buffer_cache, eb); } } + +#define btrfs_debug_check_extent_io_range(inode, start, end) \ + __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) +static inline void __btrfs_debug_check_extent_io_range(const char *caller, + struct inode *inode, u64 start, u64 end) +{ + u64 isize = i_size_read(inode); + + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + printk_ratelimited(KERN_DEBUG + "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + caller, + (unsigned long long)btrfs_ino(inode), + (unsigned long long)isize, + (unsigned long long)start, + (unsigned long long)end); + } +} #else #define btrfs_leak_debug_add(new, head) do {} while (0) #define btrfs_leak_debug_del(entry) do {} while (0) #define btrfs_leak_debug_check() do {} while (0) +#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif #define BUFFER_LRU_MAX 64 @@ -125,10 +145,20 @@ int __init extent_io_init(void) SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; + + btrfs_bioset = bioset_create(BIO_POOL_SIZE, + offsetof(struct btrfs_io_bio, bio)); + if (!btrfs_bioset) + goto free_buffer_cache; return 0; +free_buffer_cache: + kmem_cache_destroy(extent_buffer_cache); + extent_buffer_cache = NULL; + free_state_cache: kmem_cache_destroy(extent_state_cache); + extent_state_cache = NULL; return -ENOMEM; } @@ -145,6 +175,8 @@ void extent_io_exit(void) kmem_cache_destroy(extent_state_cache); if (extent_buffer_cache) kmem_cache_destroy(extent_buffer_cache); + if (btrfs_bioset) + bioset_free(btrfs_bioset); } void extent_io_tree_init(struct extent_io_tree *tree, @@ -509,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int clear = 0; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + + if (bits & EXTENT_DELALLOC) + bits |= EXTENT_NORESERVE; + if (delete) bits |= ~EXTENT_CTLBITS; bits |= EXTENT_FIRST_DELALLOC; @@ -664,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct rb_node *node; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + spin_lock(&tree->lock); again: while (1) { @@ -756,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { @@ -976,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -1948,28 +1991,6 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) } /* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static void check_page_locked(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static void check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); -} - -/* * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the @@ -2046,7 +2067,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) return 0; - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_private = &compl; @@ -2336,7 +2357,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { free_io_failure(inode, failrec, 0); return -EIO; @@ -2398,19 +2419,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; do { struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page write in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); @@ -2418,10 +2444,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (end_extent_writepage(page, err, start, end)) continue; - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); + end_page_writeback(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); @@ -2446,7 +2469,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_io_tree *tree; u64 start; u64 end; - int whole_page; int mirror; int ret; @@ -2457,19 +2479,27 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct page *page = bvec->bv_page; struct extent_state *cached = NULL; struct extent_state *state; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%ld\n", (u64)bio->bi_sector, err, - (long int)bio->bi_bdev); - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; + "mirror=%lu\n", (u64)bio->bi_sector, err, + io_bio->mirror_num); + tree = &BTRFS_I(inode)->io_tree; + + /* We always issue full-page reads, but if some block + * in a page fails to read, blk_update_request() will + * advance bv_offset and adjust bv_len to compensate. + * Print a warning for nonzero offsets, and an error + * if they don't add up to a full page. */ + if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) + printk("%s page read in btrfs with offset %u and length %u\n", + bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE + ? KERN_ERR "partial" : KERN_INFO "incomplete", + bvec->bv_offset, bvec->bv_len); + + start = page_offset(page); + end = start + bvec->bv_offset + bvec->bv_len - 1; if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); @@ -2485,7 +2515,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); - mirror = (int)(unsigned long)bio->bi_bdev; + mirror = io_bio->mirror_num; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); @@ -2528,39 +2558,43 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); + if (uptodate) { + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + /* Zero out the end if this page straddles i_size */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + SetPageUptodate(page); } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); + ClearPageUptodate(page); + SetPageError(page); } + unlock_page(page); } while (bvec <= bvec_end); bio_put(bio); } +/* + * this allocates from the btrfs_bioset. We're returning a bio right now + * but you can call btrfs_io_bio for the appropriate container_of magic + */ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { struct bio *bio; - bio = bio_alloc(gfp_flags, nr_vecs); + bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); + while (!bio && (nr_vecs /= 2)) { + bio = bio_alloc_bioset(gfp_flags, + nr_vecs, btrfs_bioset); + } } if (bio) { @@ -2571,6 +2605,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) +{ + return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); +} + + +/* this also allocates from the btrfs_bioset */ +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); +} + + static int __must_check submit_one_bio(int rw, struct bio *bio, int mirror_num, unsigned long bio_flags) { @@ -2949,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; } @@ -3988,7 +4035,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4075,7 +4122,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state, GFP_NOFS); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a2c03a1..3b8c4e2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -19,6 +19,7 @@ #define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_NEED_WAIT (1 << 13) #define EXTENT_DAMAGED (1 << 14) +#define EXTENT_NORESERVE (1 << 15) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -336,6 +337,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); +struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); +struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b193bf3..a7bfc95 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,8 +34,7 @@ #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)->sectorsize - (r)->sectorsize) + sizeof(u32) * (r)->sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_csum_item *item; LIST_HEAD(tmplist); unsigned long offset; @@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); + MAX_ORDERED_SUM_BYTES(root)); sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); + GFP_NOFS); if (!sums) { ret = -ENOMEM; goto fail; } - sector_sum = sums->sums; sums->bytenr = start; - sums->len = size; + sums->len = (int)size; offset = (start - key.offset) >> root->fs_info->sb->s_blocksize_bits; offset *= csum_size; + size >>= root->fs_info->sb->s_blocksize_bits; - while (size > 0) { - read_extent_buffer(path->nodes[0], - §or_sum->sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum->bytenr = start; - - size -= root->sectorsize; - start += root->sectorsize; - offset += csum_size; - sector_sum++; - } + read_extent_buffer(path->nodes[0], + sums->sums, + ((unsigned long)item) + offset, + csum_size * size); + + start += root->sectorsize * size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; @@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; char *data; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; + int index; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; - u64 disk_bytenr; WARN_ON(bio->bi_vcnt <= 0); sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); if (!sums) return -ENOMEM; - sector_sum = sums->sums; - disk_bytenr = (u64)bio->bi_sector << 9; sums->len = bio->bi_size; INIT_LIST_HEAD(&sums->list); @@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; + sums->bytenr = (u64)bio->bi_sector << 9; + index = 0; while (bio_index < bio->bi_vcnt) { if (!contig) @@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), GFP_NOFS); BUG_ON(!sums); /* -ENOMEM */ - sector_sum = sums->sums; sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, offset); BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; + sums->bytenr = ((u64)bio->bi_sector << 9) + + total_bytes; + index = 0; } data = kmap_atomic(bvec->bv_page); - sector_sum->sum = ~(u32)0; - sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, - sector_sum->sum, - bvec->bv_len); + sums->sums[index] = ~(u32)0; + sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset, + sums->sums[index], + bvec->bv_len); kunmap_atomic(data); - btrfs_csum_final(sector_sum->sum, - (char *)§or_sum->sum); - sector_sum->bytenr = disk_bytenr; + btrfs_csum_final(sums->sums[index], + (char *)(sums->sums + index)); - sector_sum++; bio_index++; + index++; total_bytes += bvec->bv_len; this_sum_bytes += bvec->bv_len; - disk_bytenr += bvec->bv_len; offset += bvec->bv_len; bvec++; } @@ -672,62 +661,46 @@ out: return ret; } -static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, - struct btrfs_sector_sum *sector_sum, - u64 total_bytes, u64 sectorsize) -{ - u64 tmp = sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; - - while ((tmp + total_bytes) < sums->len) { - if (next_sector + sectorsize != next->bytenr) - break; - tmp += sectorsize; - next_sector = next->bytenr; - next++; - } - return tmp; -} - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) { - u64 bytenr; - int ret; struct btrfs_key file_key; struct btrfs_key found_key; - u64 next_offset; - u64 total_bytes = 0; - int found_next; struct btrfs_path *path; struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; + u64 next_offset; + u64 total_bytes = 0; u64 csum_offset; - struct btrfs_sector_sum *sector_sum; + u64 bytenr; u32 nritems; u32 ins_size; + int index = 0; + int found_next; + int ret; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - - sector_sum = sums->sums; again: next_offset = (u64)-1; found_next = 0; + bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum->bytenr; - bytenr = sector_sum->bytenr; + file_key.offset = bytenr; btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); - item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { - leaf = path->nodes[0]; ret = 0; + leaf = path->nodes[0]; + item_end = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); @@ -807,8 +780,7 @@ again: free_space = btrfs_leaf_free_space(root, leaf) - sizeof(struct btrfs_item) - csum_size; - tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, - root->sectorsize); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; WARN_ON(tmp < 1); @@ -822,6 +794,7 @@ again: diff *= csum_size; btrfs_extend_item(root, path, diff); + ret = 0; goto csum; } @@ -831,8 +804,7 @@ insert: if (found_next) { u64 tmp; - tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, - root->sectorsize); + tmp = sums->len - total_bytes; tmp >>= root->fs_info->sb->s_blocksize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> root->fs_info->sb->s_blocksize_bits); @@ -853,31 +825,25 @@ insert: WARN_ON(1); goto fail_unlock; } -csum: leaf = path->nodes[0]; +csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - ret = 0; + item_end = (struct btrfs_csum_item *)((unsigned char *)item + + btrfs_item_size_nr(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: - item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); -next_sector: - - write_extent_buffer(leaf, §or_sum->sum, (unsigned long)item, csum_size); - - total_bytes += root->sectorsize; - sector_sum++; - if (total_bytes < sums->len) { - item = (struct btrfs_csum_item *)((char *)item + - csum_size); - if (item < item_end && bytenr + PAGE_CACHE_SIZE == - sector_sum->bytenr) { - bytenr = sector_sum->bytenr; - goto next_sector; - } - } + ins_size = (u32)(sums->len - total_bytes) >> + root->fs_info->sb->s_blocksize_bits; + ins_size *= csum_size; + ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, + ins_size); + write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, + ins_size); + + ins_size /= csum_size; + total_bytes += ins_size * root->sectorsize; + index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4205ba7..a005fe2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, ret = PTR_ERR(inode_root); goto cleanup; } - if (btrfs_root_refs(&inode_root->root_item) == 0) { - ret = -ENOENT; - goto cleanup; - } key.objectid = defrag->ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); @@ -1317,6 +1313,56 @@ fail: } +static noinline int check_can_nocow(struct inode *inode, loff_t pos, + size_t *write_bytes) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + u64 num_bytes; + int ret; + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) { + break; + } + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + return PTR_ERR(trans); + } + + num_bytes = lockend - lockstart + 1; + ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL, + NULL); + btrfs_end_transaction(trans, root); + if (ret <= 0) { + ret = 0; + } else { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, + NULL, GFP_NOFS); + *write_bytes = min_t(size_t, *write_bytes, num_bytes); + } + + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); + + return ret; +} + static noinline ssize_t __btrfs_buffered_write(struct file *file, struct iov_iter *i, loff_t pos) @@ -1324,10 +1370,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + u64 release_bytes = 0; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; + bool only_release_metadata = false; bool force_page_uptodate = false; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / @@ -1348,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, offset); size_t num_pages = (write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1362,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } - ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = btrfs_check_data_free_space(inode, reserve_bytes); + if (ret == -ENOSPC && + (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC))) { + ret = check_can_nocow(inode, pos, &write_bytes); + if (ret > 0) { + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + ret = 0; + } else { + ret = -ENOSPC; + } + } + if (ret) break; + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); + if (ret) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, + reserve_bytes); + break; + } + + release_bytes = reserve_bytes; + /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the @@ -1375,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, ret = prepare_pages(root, file, pages, num_pages, pos, first_index, write_bytes, force_page_uptodate); - if (ret) { - btrfs_delalloc_release_space(inode, - num_pages << PAGE_CACHE_SHIFT); + if (ret) break; - } copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1409,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { + release_bytes = (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT; if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_delalloc_release_space(inode, - (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, + release_bytes); + else + btrfs_delalloc_release_space(inode, + release_bytes); } + release_bytes = dirty_pages << PAGE_CACHE_SHIFT; if (copied > 0) { ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); if (ret) { - btrfs_delalloc_release_space(inode, - dirty_pages << PAGE_CACHE_SHIFT); btrfs_drop_pages(pages, num_pages); break; } } + release_bytes = 0; btrfs_drop_pages(pages, num_pages); + if (only_release_metadata && copied > 0) { + u64 lockstart = round_down(pos, root->sectorsize); + u64 lockend = lockstart + + (dirty_pages << PAGE_CACHE_SHIFT) - 1; + + set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_NORESERVE, NULL, + NULL, GFP_NOFS); + only_release_metadata = false; + } + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1445,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, kfree(pages); + if (release_bytes) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, release_bytes); + else + btrfs_delalloc_release_space(inode, release_bytes); + } + return num_written ? num_written : ret; } @@ -2175,12 +2274,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out_reserve_fail; } - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); if (ret) @@ -2191,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start); if (ret) goto out; + } else { + /* + * If we are fallocating from the end of the file onward we + * need to zero out the end of the page if i_size lands in the + * middle of a page. + */ + ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); + if (ret) + goto out; } + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2425,20 +2533,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { - offset = -EINVAL; - goto out; - } - if (offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); return offset; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecca6c7..b21a3cd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root, block_group->key.objectid); } -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) { - struct btrfs_block_rsv *rsv; u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + int ret; /* 1 for slack space, 1 for updating the inode */ needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + btrfs_calc_trans_metadata_size(root, 1); - spin_lock(&trans->block_rsv->lock); - if (trans->block_rsv->reserved < needed_bytes) { - spin_unlock(&trans->block_rsv->lock); - trans->block_rsv = rsv; - return -ENOSPC; - } - spin_unlock(&trans->block_rsv->lock); + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -232,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - trans->block_rsv = rsv; btrfs_abort_transaction(trans, root, ret); return ret; } @@ -242,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, inode); if (ret) btrfs_abort_transaction(trans, root, ret); - trans->block_rsv = rsv; return ret; } @@ -920,10 +919,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Make sure we can fit our crcs into the first page */ if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { - WARN_ON(1); + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) goto out_nospc; - } io_ctl_set_generation(&io_ctl, trans->transid); @@ -3153,6 +3150,8 @@ again: return 0; } +#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__) + /* * This test just does basic sanity checking, making sure we can add an exten * entry and remove space from either end and the middle, and make sure we can @@ -3162,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache) { int ret = 0; - printk(KERN_ERR "Running extent only tests\n"); + test_msg("Running extent only tests\n"); /* First just make sure we can remove an entire entry */ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error adding initial extents %d\n", ret); + test_msg("Error adding initial extents %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error removing extent %d\n", ret); + test_msg("Error removing extent %d\n", ret); return ret; } if (check_exists(cache, 0, 4 * 1024 * 1024)) { - printk(KERN_ERR "Full remove left some lingering space\n"); + test_msg("Full remove left some lingering space\n"); return -1; } /* Ok edge and middle cases now */ ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error adding half extent %d\n", ret); + test_msg("Error adding half extent %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error removing tail end %d\n", ret); + test_msg("Error removing tail end %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error removing front end %d\n", ret); + test_msg("Error removing front end %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); if (ret) { - printk(KERN_ERR "Error removing middle peice %d\n", ret); + test_msg("Error removing middle piece %d\n", ret); return ret; } if (check_exists(cache, 0, 1 * 1024 * 1024)) { - printk(KERN_ERR "Still have space at the front\n"); + test_msg("Still have space at the front\n"); return -1; } if (check_exists(cache, 2 * 1024 * 1024, 4096)) { - printk(KERN_ERR "Still have space in the middle\n"); + test_msg("Still have space in the middle\n"); return -1; } if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { - printk(KERN_ERR "Still have space at the end\n"); + test_msg("Still have space at the end\n"); return -1; } @@ -3233,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) u64 next_bitmap_offset; int ret; - printk(KERN_ERR "Running bitmap only tests\n"); + test_msg("Running bitmap only tests\n"); ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); + test_msg("Couldn't create a bitmap entry %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error removing bitmap full range %d\n", ret); + test_msg("Error removing bitmap full range %d\n", ret); return ret; } if (check_exists(cache, 0, 4 * 1024 * 1024)) { - printk(KERN_ERR "Left some space in bitmap\n"); + test_msg("Left some space in bitmap\n"); return -1; } ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); + test_msg("Couldn't add to our bitmap entry %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); + test_msg("Couldn't remove middle chunk %d\n", ret); return ret; } @@ -3274,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) ret = add_free_space_entry(cache, next_bitmap_offset - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't add space that straddles two bitmaps" - " %d\n", ret); + test_msg("Couldn't add space that straddles two bitmaps %d\n", + ret); return ret; } ret = btrfs_remove_free_space(cache, next_bitmap_offset - (1 * 1024 * 1024), 2 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); + test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), 2 * 1024 * 1024)) { - printk(KERN_ERR "Left some space when removing overlapping\n"); + test_msg("Left some space when removing overlapping\n"); return -1; } @@ -3303,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); int ret; - printk(KERN_ERR "Running bitmap and extent tests\n"); + test_msg("Running bitmap and extent tests\n"); /* * First let's do something simple, an extent at the same offset as the @@ -3312,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) */ ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); + test_msg("Couldn't create bitmap entry %d\n", ret); return ret; } ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); if (ret) { - printk(KERN_ERR "Couldn't add extent entry %d\n", ret); + test_msg("Couldn't add extent entry %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); + test_msg("Couldn't remove extent entry %d\n", ret); return ret; } if (check_exists(cache, 0, 1 * 1024 * 1024)) { - printk(KERN_ERR "Left remnants after our remove\n"); + test_msg("Left remnants after our remove\n"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); if (ret) { - printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); + test_msg("Couldn't re-add extent entry %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); + test_msg("Couldn't remove from bitmap %d\n", ret); return ret; } if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { - printk(KERN_ERR "Left remnants in the bitmap\n"); + test_msg("Left remnants in the bitmap\n"); return -1; } @@ -3357,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) */ ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); + test_msg("Couldn't add to a bitmap %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); + test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - printk(KERN_ERR "Left over peices after removing " - "overlapping\n"); + test_msg("Left over peices after removing overlapping\n"); return -1; } @@ -3378,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) /* Now with the extent entry offset into the bitmap */ ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); + test_msg("Couldn't add space to the bitmap %d\n", ret); return ret; } ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); if (ret) { - printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); + test_msg("Couldn't add extent to the cache %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Problem removing overlapping space %d\n", ret); + test_msg("Problem removing overlapping space %d\n", ret); return ret; } if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { - printk(KERN_ERR "Left something behind when removing space"); + test_msg("Left something behind when removing space"); return -1; } @@ -3413,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, 4 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't add bitmap %d\n", ret); + test_msg("Couldn't add bitmap %d\n", ret); return ret; } ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, 5 * 1024 * 1024, 0); if (ret) { - printk(KERN_ERR "Couldn't add extent entry %d\n", ret); + test_msg("Couldn't add extent entry %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, 5 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Failed to free our space %d\n", ret); + test_msg("Failed to free our space %d\n", ret); return ret; } if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, 5 * 1024 * 1024)) { - printk(KERN_ERR "Left stuff over\n"); + test_msg("Left stuff over\n"); return -1; } @@ -3447,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) */ ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); if (ret) { - printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); + test_msg("Couldn't add bitmap entry %d\n", ret); return ret; } ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); if (ret) { - printk(KERN_ERR "Couldn't add extent entry %d\n", ret); + test_msg("Couldn't add extent entry %d\n", ret); return ret; } ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); if (ret) { - printk(KERN_ERR "Error removing bitmap and extent " - "overlapping %d\n", ret); + test_msg("Error removing bitmap and extent overlapping %d\n", ret); return ret; } @@ -3472,11 +3469,11 @@ void btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; - printk(KERN_ERR "Running btrfs free space cache tests\n"); + test_msg("Running btrfs free space cache tests\n"); cache = init_test_block_group(); if (!cache) { - printk(KERN_ERR "Couldn't run the tests\n"); + test_msg("Couldn't run the tests\n"); return; } @@ -3490,6 +3487,9 @@ out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); kfree(cache); - printk(KERN_ERR "Free space cache tests finished\n"); + test_msg("Free space cache tests finished\n"); } -#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ +#undef test_msg +#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ +void btrfs_test_free_space_cache(void) {} +#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 4dc17d8..894116b7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); +int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, @@ -111,8 +113,6 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_free_space_cache(void); -#endif #endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d26f67a..2c66ddb 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, num_bytes = trans->bytes_reserved; /* * 1 item for inode item insertion if need - * 3 items for inode item update (in the worst case) + * 4 items for inode item update (in the worst case) + * 1 items for slack space if we need do truncation * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); @@ -468,7 +469,8 @@ again: if (i_size_read(inode) > 0) { ret = btrfs_truncate_free_space_cache(root, trans, path, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret != -ENOSPC) + btrfs_abort_transaction(trans, root, ret); goto out_put; } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b31b3b..6d1b93c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -42,6 +42,7 @@ #include <linux/mount.h> #include <linux/btrfs.h> #include <linux/blkdev.h> +#include <linux/posix_acl_xattr.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -57,6 +58,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "backref.h" +#include "hash.h" struct btrfs_iget_args { u64 ino; @@ -701,8 +703,12 @@ retry: async_extent->nr_pages = 0; async_extent->pages = NULL; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + unlock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1); goto retry; + } goto out_free; } @@ -715,8 +721,10 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_free_reserve; + } em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -923,8 +931,10 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, } em = alloc_extent_map(); - if (!em) + if (!em) { + ret = -ENOMEM; goto out_reserve; + } em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -1525,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } +static void btrfs_add_delalloc_inodes(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes++; + if (root->nr_delalloc_inodes == 1) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(!list_empty(&root->delalloc_root)); + list_add_tail(&root->delalloc_root, + &root->fs_info->delalloc_roots); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct inode *inode) +{ + spin_lock(&root->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + root->nr_delalloc_inodes--; + if (!root->nr_delalloc_inodes) { + spin_lock(&root->fs_info->delalloc_root_lock); + BUG_ON(list_empty(&root->delalloc_root)); + list_del_init(&root->delalloc_root); + spin_unlock(&root->fs_info->delalloc_root_lock); + } + } + spin_unlock(&root->delalloc_lock); +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1557,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1600,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list) + && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, @@ -1609,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, BTRFS_I(inode)->delalloc_bytes -= len; if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); - } - spin_unlock(&root->fs_info->delalloc_lock); - } + &BTRFS_I(inode)->runtime_flags)) + btrfs_del_delalloc_inode(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } } @@ -2259,11 +2294,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return 0; return PTR_ERR(root); } - if (btrfs_root_refs(&root->root_item) == 0) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - /* parse ENOENT to 0 */ - return 0; - } /* step 2: get inode */ key.objectid = backref->inum; @@ -3211,13 +3241,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* 1 for the orphan item deletion. */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { + iput(inode); ret = PTR_ERR(trans); goto out; } ret = btrfs_orphan_add(trans, inode); btrfs_end_transaction(trans, root); - if (ret) + if (ret) { + iput(inode); goto out; + } ret = btrfs_truncate(inode); if (ret) @@ -3270,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; + static u64 xattr_access = 0; + static u64 xattr_default = 0; int scanned = 0; + if (!xattr_access) { + xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS)); + xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT)); + } + slot++; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -3281,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, return 0; /* we found an xattr, assume we've got an acl */ - if (found_key.type == BTRFS_XATTR_ITEM_KEY) - return 1; + if (found_key.type == BTRFS_XATTR_ITEM_KEY) { + if (found_key.offset == xattr_access || + found_key.offset == xattr_default) + return 1; + } /* * we found a key greater than an xattr key, there can't @@ -3656,53 +3701,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, } return ret; } - - -/* helper to check if there is any shared block in the path */ -static int check_path_shared(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct extent_buffer *eb; - int level; - u64 refs = 1; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - int ret; - - if (!path->nodes[level]) - break; - eb = path->nodes[level]; - if (!btrfs_block_can_be_shared(root, eb)) - continue; - ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1, - &refs, NULL); - if (refs > 1) - return 1; - } - return 0; -} /* * helper to start transaction for unlink and rmdir. * - * unlink and rmdir are special in btrfs, they do not always free space. - * so in enospc case, we should make sure they will free space before - * allowing them to use the global metadata reservation. + * unlink and rmdir are special in btrfs, they do not always free space, so + * if we cannot make our reservations the normal way try and see if there is + * plenty of slack room in the global reserve to migrate, otherwise we cannot + * allow the unlink to occur. */ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, - struct dentry *dentry) +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_dir_item *di; - struct inode *inode = dentry->d_inode; - u64 index; - int check_link = 1; - int err = -ENOSPC; int ret; - u64 ino = btrfs_ino(inode); - u64 dir_ino = btrfs_ino(dir); /* * 1 for the possible orphan item @@ -3715,158 +3727,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; - if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return ERR_PTR(-ENOSPC); - - /* check if there is someone else holds reference */ - if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) - return ERR_PTR(-ENOSPC); - - if (atomic_read(&inode->i_count) > 2) - return ERR_PTR(-ENOSPC); - - if (xchg(&root->fs_info->enospc_unlink, 1)) - return ERR_PTR(-ENOSPC); - - path = btrfs_alloc_path(); - if (!path) { - root->fs_info->enospc_unlink = 0; - return ERR_PTR(-ENOMEM); - } - - /* 1 for the orphan item */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_free_path(path); - root->fs_info->enospc_unlink = 0; - return trans; - } - - path->skip_locking = 1; - path->search_commit_root = 1; + if (PTR_ERR(trans) == -ENOSPC) { + u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(dir)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - if (ret == 0 && S_ISREG(inode->i_mode)) { - ret = btrfs_lookup_file_extent(trans, root, path, - ino, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return trans; + ret = btrfs_cond_migrate_bytes(root->fs_info, + &root->fs_info->trans_block_rsv, + num_bytes, 5); + if (ret) { + btrfs_end_transaction(trans, root); + return ERR_PTR(ret); } - BUG_ON(ret == 0); /* Corruption */ - if (check_path_shared(root, path)) - goto out; - btrfs_release_path(path); - } - - if (!check_link) { - err = 0; - goto out; - } - - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - if (di) { - if (check_path_shared(root, path)) - goto out; - } else { - err = 0; - goto out; - } - btrfs_release_path(path); - - ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name, - dentry->d_name.len, ino, dir_ino, 0, - &index); - if (ret) { - err = ret; - goto out; - } - - if (check_path_shared(root, path)) - goto out; - - btrfs_release_path(path); - - /* - * This is a commit root search, if we can lookup inode item and other - * relative items in the commit root, it means the transaction of - * dir/file creation has been committed, and the dir index item that we - * delay to insert has also been inserted into the commit root. So - * we needn't worry about the delayed insertion of the dir index item - * here. - */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - BUG_ON(ret == -ENOENT); - if (check_path_shared(root, path)) - goto out; - - err = 0; -out: - btrfs_free_path(path); - /* Migrate the orphan reservation over */ - if (!err) - err = btrfs_block_rsv_migrate(trans->block_rsv, - &root->fs_info->global_block_rsv, - trans->bytes_reserved); - - if (err) { - btrfs_end_transaction(trans, root); - root->fs_info->enospc_unlink = 0; - return ERR_PTR(err); - } - - trans->block_rsv = &root->fs_info->global_block_rsv; - return trans; -} - -static void __unlink_end_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) { - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); trans->block_rsv = &root->fs_info->trans_block_rsv; - BUG_ON(!root->fs_info->enospc_unlink); - root->fs_info->enospc_unlink = 0; + trans->bytes_reserved = num_bytes; } - btrfs_end_transaction(trans, root); + return trans; } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -3876,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; int ret; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3894,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return ret; } @@ -3991,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) return -EPERM; - trans = __unlink_start_trans(dir, dentry); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4013,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (!err) btrfs_i_size_write(inode, 0); out: - __unlink_end_trans(trans, root); + btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); return err; @@ -4391,6 +4268,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) u64 hole_size; int err = 0; + /* + * If our size started in the middle of a page we need to zero out the + * rest of the page before we expand the i_size, otherwise we could + * expose stale data. + */ + err = btrfs_truncate_page(inode, oldsize, 0, 0); + if (err) + return err; + if (size <= hole_start) return 0; @@ -4724,6 +4610,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); no_delete: + btrfs_remove_delayed_node(inode); clear_inode(inode); return; } @@ -4817,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root, goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - err = -ENOENT; - goto out; - } - *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; @@ -4839,14 +4721,13 @@ static void inode_tree_add(struct inode *inode) struct rb_node **p; struct rb_node *parent; u64 ino = btrfs_ino(inode); -again: - p = &root->inode_tree.rb_node; - parent = NULL; if (inode_unhashed(inode)) return; - +again: + parent = NULL; spin_lock(&root->inode_lock); + p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); @@ -5088,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (!(inode->i_sb->s_flags & MS_RDONLY)) ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); - if (ret) + if (ret) { + iput(inode); inode = ERR_PTR(ret); + } } return inode; @@ -5133,10 +5016,9 @@ unsigned char btrfs_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int btrfs_real_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_item *item; struct btrfs_dir_item *di; @@ -5157,29 +5039,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, char tmp_name[32]; char *name_ptr; int name_len; - int is_curr = 0; /* filp->f_pos points to the current index? */ + int is_curr = 0; /* ctx->pos points to the current index? */ /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) key_type = BTRFS_DIR_ITEM_KEY; - /* special case for "." */ - if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, - filp->f_pos, btrfs_ino(inode), DT_DIR); - if (over) - return 0; - filp->f_pos = 1; - } - /* special case for .., just use the back ref */ - if (filp->f_pos == 1) { - u64 pino = parent_ino(filp->f_path.dentry); - over = filldir(dirent, "..", 2, - filp->f_pos, pino, DT_DIR); - if (over) - return 0; - filp->f_pos = 2; - } + if (!dir_emit_dots(file, ctx)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5193,7 +5061,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, } btrfs_set_key_type(&key, key_type); - key.offset = filp->f_pos; + key.offset = ctx->pos; key.objectid = btrfs_ino(inode); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -5219,14 +5087,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, break; if (btrfs_key_type(&found_key) != key_type) break; - if (found_key.offset < filp->f_pos) + if (found_key.offset < ctx->pos) goto next; if (key_type == BTRFS_DIR_INDEX_KEY && btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; - filp->f_pos = found_key.offset; + ctx->pos = found_key.offset; is_curr = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); @@ -5270,9 +5138,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, over = 0; goto skip; } - over = filldir(dirent, name_ptr, name_len, - found_key.offset, location.objectid, - d_type); + over = !dir_emit(ctx, name_ptr, name_len, + location.objectid, d_type); skip: if (name_ptr != tmp_name) @@ -5291,9 +5158,8 @@ next: if (key_type == BTRFS_DIR_INDEX_KEY) { if (is_curr) - filp->f_pos++; - ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, - &ins_list); + ctx->pos++; + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; } @@ -5304,9 +5170,9 @@ next: * 32-bit glibc will use getdents64, but then strtol - * so the last number we can serve is this. */ - filp->f_pos = 0x7fffffff; + ctx->pos = 0x7fffffff; else - filp->f_pos++; + ctx->pos++; nopos: ret = 0; err: @@ -6514,10 +6380,10 @@ out: * returns 1 when the nocow is safe, < 1 on error, 0 if the * block must be cow'd */ -static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) +noinline int can_nocow_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) { struct btrfs_path *path; int ret; @@ -6531,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, u64 num_bytes; int slot; int found_type; - + bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6571,18 +6437,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, /* not a regular extent, must cow */ goto out; } + + if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + goto out; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + goto out; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + backref_offset = btrfs_file_extent_offset(leaf, fi); - *orig_start = key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (orig_start) { + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end < offset + *len) { - /* extent doesn't include our full range, must cow */ - goto out; - } if (btrfs_extent_readonly(root, disk_bytenr)) goto out; @@ -6826,8 +6702,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (IS_ERR(trans)) goto must_cow; - if (can_nocow_odirect(trans, inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + if (can_nocow_extent(trans, inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, @@ -6928,7 +6804,11 @@ struct btrfs_dio_private { /* IO errors */ int errors; + /* orig_bio is our btrfs_io_bio */ struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) @@ -6938,6 +6818,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct bio_vec *bvec = bio->bi_io_vec; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; + struct bio *dio_bio; u64 start; start = dip->logical_offset; @@ -6977,14 +6858,15 @@ failed: unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had a csum failure make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static void btrfs_endio_direct_write(struct bio *bio, int err) @@ -6995,6 +6877,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_ordered_extent *ordered = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; + struct bio *dio_bio; int ret; if (err) @@ -7022,14 +6905,15 @@ out_test: goto again; } out_done: - bio->bi_private = dip->private; + dio_bio = dip->dio_bio; kfree(dip); /* If we had an error make sure to clear the uptodate flag */ if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_end_io(dio_bio, err); + bio_put(bio); } static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, @@ -7065,10 +6949,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (!atomic_dec_and_test(&dip->pending_bios)) goto out; - if (dip->errors) + if (dip->errors) { bio_io_error(dip->orig_bio); - else { - set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + } else { + set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); bio_endio(dip->orig_bio, 0); } out: @@ -7243,48 +7127,54 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, - loff_t file_offset) +static void btrfs_submit_direct(int rw, struct bio *dio_bio, + struct inode *inode, loff_t file_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; - struct bio_vec *bvec = bio->bi_io_vec; + struct bio *io_bio; int skip_sum; int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); + + if (!io_bio) { + ret = -ENOMEM; + goto free_ordered; + } + dip = kmalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_ordered; + goto free_io_bio; } - dip->private = bio->bi_private; + dip->private = dio_bio->bi_private; dip->inode = inode; dip->logical_offset = file_offset; - - dip->bytes = 0; - do { - dip->bytes += bvec->bv_len; - bvec++; - } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); - - dip->disk_bytenr = (u64)bio->bi_sector << 9; - bio->bi_private = dip; + dip->bytes = dio_bio->bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; + io_bio->bi_private = dip; dip->errors = 0; - dip->orig_bio = bio; + dip->orig_bio = io_bio; + dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); if (write) - bio->bi_end_io = btrfs_endio_direct_write; + io_bio->bi_end_io = btrfs_endio_direct_write; else - bio->bi_end_io = btrfs_endio_direct_read; + io_bio->bi_end_io = btrfs_endio_direct_read; ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; + +free_io_bio: + bio_put(io_bio); + free_ordered: /* * If this is a write, we need to clean up the reserved space and kill @@ -7300,7 +7190,7 @@ free_ordered: btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } - bio_endio(bio, ret); + bio_endio(dio_bio, ret); } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, @@ -7364,8 +7254,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, atomic_inc(&inode->i_dio_count); smp_mb__after_atomic_inc(); + /* + * The generic stuff only does filemap_write_and_wait_range, which isn't + * enough if we've written compressed pages to this area, so we need to + * call btrfs_wait_ordered_range to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + count = iov_length(iov, nr_segs); + btrfs_wait_ordered_range(inode, offset, count); + if (rw & WRITE) { - count = iov_length(iov, nr_segs); /* * If the write DIO is beyond the EOF, we need update * the isize, but it is protected by i_mutex. So we can @@ -7484,7 +7382,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } -static void btrfs_invalidatepage(struct page *page, unsigned long offset) +static void btrfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct extent_io_tree *tree; @@ -7684,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; - int ret; + int ret = 0; int err = 0; struct btrfs_trans_handle *trans; u64 mask = root->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - ret = btrfs_truncate_page(inode, inode->i_size, 0, 0); - if (ret) - return ret; - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); @@ -7951,9 +7846,9 @@ void btrfs_destroy_inode(struct inode *inode) */ smp_mb(); if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, @@ -7979,7 +7874,6 @@ void btrfs_destroy_inode(struct inode *inode) inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: - btrfs_remove_delayed_node(inode); call_rcu(&inode->i_rcu, btrfs_i_callback); } @@ -7987,6 +7881,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (root == NULL) + return 1; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && root != root->fs_info->tree_root) @@ -8321,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct btrfs_inode *binode; struct inode *inode; @@ -8330,30 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) struct list_head splice; int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); + spin_lock(&root->delalloc_lock); + list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); - list_del_init(&binode->delalloc_inodes); - + list_move_tail(&binode->delalloc_inodes, + &root->delalloc_inodes); inode = igrab(&binode->vfs_inode); if (!inode) { - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &binode->runtime_flags); + cond_resched_lock(&root->delalloc_lock); continue; } - - list_add_tail(&binode->delalloc_inodes, - &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); if (unlikely(!work)) { @@ -8365,16 +8255,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) &work->work); cond_resched(); - spin_lock(&root->fs_info->delalloc_lock); + spin_lock(&root->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&root->delalloc_lock); list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + return 0; +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->delalloc_lock); + list_splice_tail(&splice, &root->delalloc_inodes); + spin_unlock(&root->delalloc_lock); + } + return ret; +} - /* the filemap_flush will queue IO into the worker threads, but +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +{ + int ret; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + ret = __start_delalloc_inodes(root, delay_iput); + /* + * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return */ @@ -8386,17 +8299,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - return 0; -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); + return ret; +} + +int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info, + int delay_iput) +{ + struct btrfs_root *root; + struct list_head splice; + int ret; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->delalloc_root_lock); + list_splice_init(&fs_info->delalloc_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + delalloc_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->delalloc_root, + &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); + + ret = __start_delalloc_inodes(root, delay_iput); + btrfs_put_fs_root(root); + if (ret) + goto out; + + spin_lock(&fs_info->delalloc_root_lock); } + spin_unlock(&fs_info->delalloc_root_lock); + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&fs_info->async_submit_draining); + return 0; +out: if (!list_empty_careful(&splice)) { - spin_lock(&root->fs_info->delalloc_lock); - list_splice_tail(&splice, &root->fs_info->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); + spin_lock(&fs_info->delalloc_root_lock); + list_splice_tail(&splice, &fs_info->delalloc_roots); + spin_unlock(&fs_info->delalloc_root_lock); } return ret; } @@ -8703,7 +8654,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = btrfs_real_readdir, + .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de4a2f..238a055 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root->ref_cows) return -EINVAL; + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + + btrfs_wait_ordered_extents(root, 0); + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) return -ENOMEM; @@ -1801,7 +1807,11 @@ static noinline int copy_to_sk(struct btrfs_root *root, item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size_nr(leaf, i); - if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) item_len = 0; if (sizeof(sh) + item_len + *sk_offset > @@ -1810,10 +1820,6 @@ static noinline int copy_to_sk(struct btrfs_root *root, goto overflow; } - btrfs_item_key_to_cpu(leaf, key, i); - if (!key_in_sk(key, sk)) - continue; - sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; @@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - mnt_drop_write_file(file); - return -EINVAL; - } - - mutex_lock(&root->fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(root, vol_args->name); - kfree(vol_args); -out: + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + +out: + kfree(vol_args); mnt_drop_write_file(file); return ret; } @@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; + int same_inode = 0; /* * TODO: @@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = -EINVAL; if (src == inode) - goto out_fput; + same_inode = 1; /* the src must be open for reading */ if (!(src_file.file->f_mode & FMODE_READ)) @@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } path->reada = 2; - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + if (!same_inode) { + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + mutex_lock(&src->i_mutex); } /* determine range to clone */ @@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, !IS_ALIGNED(destoff, bs)) goto out_unlock; + /* verify if ranges are overlapped within the same file */ + if (same_inode) { + if (destoff + len > off && destoff < off + len) + goto out_unlock; + } + if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) @@ -2846,7 +2863,8 @@ out: unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); + if (!same_inode) + mutex_unlock(&inode->i_mutex); vfree(buf); btrfs_free_path(path); out_fput: @@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) goto out; } - if (btrfs_root_refs(&new_root->root_item) == 0) { - ret = -ENOENT; - goto out; - } - path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) break; } - if (copy_to_user(arg, sa, sizeof(*sa))) - ret = -EFAULT; - err = btrfs_commit_transaction(trans, root->fs_info->tree_root); if (err && !ret) ret = err; @@ -3881,7 +3891,7 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_quota_rescan_args *qsa; int ret; @@ -3914,7 +3924,7 @@ drop_write: static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_ioctl_quota_rescan_args *qsa; int ret = 0; @@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) return ret; } +static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_qgroup_wait_for_completion(root->fs_info); +} + static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { @@ -4020,7 +4040,7 @@ out: static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; const char *label = root->fs_info->super_copy->label; size_t len = strnlen(label, BTRFS_LABEL_SIZE); int ret; @@ -4039,7 +4059,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_super_block *super_block = root->fs_info->super_copy; struct btrfs_trans_handle *trans; char label[BTRFS_LABEL_SIZE]; @@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_WAIT: + return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); case BTRFS_IOC_GET_FSLABEL: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 743b86f..f93151a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -31,8 +31,8 @@ struct workspace { void *mem; - void *buf; /* where compressed data goes */ - void *cbuf; /* where decompressed data goes */ + void *buf; /* where decompressed data goes */ + void *cbuf; /* where compressed data goes */ struct list_head list; }; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1ddd728..8136982 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "btrfs_inode.h" #include "extent_io.h" +#include "disk-io.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int dio, int compress_type) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; @@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, ordered_data_tree_panic(inode, -EEXIST, file_offset); spin_unlock_irq(&tree->lock); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, - &BTRFS_I(inode)->root->fs_info->ordered_extents); - spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, + &root->fs_info->ordered_roots); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); return 0; } @@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode, set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); spin_unlock_irq(&tree->lock); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); + root->nr_ordered_extents--; trace_btrfs_ordered_extent_remove(inode, entry); @@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { list_del_init(&BTRFS_I(inode)->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + + if (!root->nr_ordered_extents) { + spin_lock(&root->fs_info->ordered_root_lock); + BUG_ON(list_empty(&root->ordered_root)); + list_del_init(&root->ordered_root); + spin_unlock(&root->fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); } @@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { struct list_head splice, works; - struct list_head *cur; struct btrfs_ordered_extent *ordered, *next; struct inode *inode; @@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_extents, &splice); + spin_lock(&root->ordered_extent_lock); + list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice)) { - cur = splice.next; - ordered = list_entry(cur, struct btrfs_ordered_extent, - root_extent_list); - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - + ordered = list_first_entry(&splice, struct btrfs_ordered_extent, + root_extent_list); + list_move_tail(&ordered->root_extent_list, + &root->ordered_extents); /* * the inode may be getting freed (in sys_unlink path). */ inode = igrab(ordered->inode); + if (!inode) { + cond_resched_lock(&root->ordered_extent_lock); + continue; + } - spin_unlock(&root->fs_info->ordered_extent_lock); + atomic_inc(&ordered->refs); + spin_unlock(&root->ordered_extent_lock); - if (inode) { - ordered->flush_work.func = btrfs_run_ordered_extent_work; - list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &ordered->flush_work); - } else { - btrfs_put_ordered_extent(ordered); - } + ordered->flush_work.func = btrfs_run_ordered_extent_work; + list_add_tail(&ordered->work_list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->ordered_extent_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->ordered_extent_lock); list_for_each_entry_safe(ordered, next, &works, work_list) { list_del_init(&ordered->work_list); @@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) mutex_unlock(&root->fs_info->ordered_operations_mutex); } +void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info, + int delay_iput) +{ + struct btrfs_root *root; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + spin_lock(&fs_info->ordered_root_lock); + list_splice_init(&fs_info->ordered_roots, &splice); + while (!list_empty(&splice)) { + root = list_first_entry(&splice, struct btrfs_root, + ordered_root); + root = btrfs_grab_fs_root(root); + BUG_ON(!root); + list_move_tail(&root->ordered_root, + &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + + btrfs_wait_ordered_extents(root, delay_iput); + btrfs_put_fs_root(root); + + spin_lock(&fs_info->ordered_root_lock); + } + spin_unlock(&fs_info->ordered_root_lock); +} + /* * this is used during transaction commit to write all the inodes * added to the ordered operation list. These files must be fully on @@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, @@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, if (!wait) list_add_tail(&BTRFS_I(inode)->ordered_operations, &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); work = btrfs_alloc_delalloc_work(inode, wait, 1); if (!work) { - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) list_add_tail(&btrfs_inode->ordered_operations, &splice); list_splice_tail(&splice, &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); ret = -ENOMEM; goto out; } @@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, &work->work); cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); @@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len) { struct btrfs_ordered_sum *ordered_sum; - struct btrfs_sector_sum *sector_sums; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; @@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { i = (disk_bytenr - ordered_sum->bytenr) >> inode->i_sb->s_blocksize_bits; - sector_sums = ordered_sum->sums + i; num_sectors = ordered_sum->len >> inode->i_sb->s_blocksize_bits; - for (; i < num_sectors; i++) { - if (sector_sums[i].bytenr == disk_bytenr) { - sum[index] = sector_sums[i].sum; - index++; - if (index == len) - goto out; - disk_bytenr += sectorsize; - } - } + num_sectors = min_t(int, len - index, num_sectors - i); + memcpy(sum + index, ordered_sum->sums + i, + num_sectors); + + index += (int)num_sectors; + if (index == len) + goto out; + disk_bytenr += num_sectors * sectorsize; } } out: @@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - spin_lock(&root->fs_info->ordered_extent_lock); + spin_lock(&root->fs_info->ordered_root_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, &cur_trans->ordered_operations); } - spin_unlock(&root->fs_info->ordered_extent_lock); + spin_unlock(&root->fs_info->ordered_root_lock); } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 58b0e3b..68844d5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree { struct rb_node *last; }; -/* - * these are used to collect checksums done just before bios submission. - * They are attached via a list into the ordered extent, and - * checksum items are inserted into the tree after all the blocks in - * the ordered extent are on disk - */ -struct btrfs_sector_sum { - /* bytenr on disk */ - u64 bytenr; - u32 sum; -}; - struct btrfs_ordered_sum { /* bytenr is the start of this extent on disk */ u64 bytenr; @@ -45,10 +33,10 @@ struct btrfs_ordered_sum { /* * this is the length in bytes covered by the sums array below. */ - unsigned long len; + int len; struct list_head list; - /* last field is a variable length array of btrfs_sector_sums */ - struct btrfs_sector_sum sums[]; + /* last field is a variable length array of csums */ + u32 sums[]; }; /* @@ -149,11 +137,8 @@ struct btrfs_ordered_extent { static inline int btrfs_ordered_sum_size(struct btrfs_root *root, unsigned long bytes) { - unsigned long num_sectors = (bytes + root->sectorsize - 1) / - root->sectorsize; - num_sectors++; - return sizeof(struct btrfs_ordered_sum) + - num_sectors * sizeof(struct btrfs_sector_sum); + int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize); + return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); } static inline void @@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info, + int delay_iput); void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9d49c58..1280eff 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -98,13 +98,10 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -struct qgroup_rescan { - struct btrfs_work work; - struct btrfs_fs_info *fs_info; -}; - -static void qgroup_rescan_start(struct btrfs_fs_info *fs_info, - struct qgroup_rescan *qscan); +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags); +static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); /* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, @@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) int slot; int ret = 0; u64 flags = 0; + u64 rescan_progress = 0; if (!fs_info->quota_enabled) return 0; + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - fs_info->qgroup_rescan_progress.objectid = - btrfs_qgroup_status_rescan(l, ptr); - if (fs_info->qgroup_flags & - BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - struct qgroup_rescan *qscan = - kmalloc(sizeof(*qscan), GFP_NOFS); - if (!qscan) { - ret = -ENOMEM; - goto out; - } - fs_info->qgroup_rescan_progress.type = 0; - fs_info->qgroup_rescan_progress.offset = 0; - qgroup_rescan_start(fs_info, qscan); - } + rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -421,9 +412,18 @@ out: if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) { + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); } btrfs_free_path(path); + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + } + return ret < 0 ? ret : 0; } @@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } kfree(qgroup); } + ulist_free(fs_info->qgroup_ulist); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, @@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, goto out; } + fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + /* * initially create the quota tree */ @@ -916,6 +923,10 @@ out_free_root: kfree(quota_root); } out: + if (ret) { + ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; + } mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, u64 ref_root; struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - struct ulist *tmp = NULL; u64 seq; int ret = 0; int sgn; @@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - ret = 0; - goto unlock; - } - } quota_root = fs_info->quota_root; if (!quota_root) @@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, /* * step 1: for each old ref, visit all nodes once and inc refcnt */ - tmp = ulist_alloc(GFP_ATOMIC); - if (!tmp) { - ret = -ENOMEM; - goto unlock; - } + ulist_reinit(fs_info->qgroup_ulist); seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); + ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, + seq); if (ret) goto unlock; /* * step 2: walk from the new root */ - ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, - node->num_bytes, qgroup); + ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, + seq, sgn, node->num_bytes, qgroup); if (ret) goto unlock; /* * step 3: walk again from old refs */ - ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, - node->num_bytes); + ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, + seq, sgn, node->num_bytes); if (ret) goto unlock; unlock: spin_unlock(&fs_info->qgroup_lock); - mutex_unlock(&fs_info->qgroup_rescan_lock); ulist_free(roots); - ulist_free(tmp); return ret; } @@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; if (!ret && start_rescan_worker) { - ret = btrfs_qgroup_rescan(fs_info); - if (ret) - pr_err("btrfs: start rescan quota failed: %d\n", ret); + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } ret = 0; } @@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - struct ulist *ulist = NULL; struct ulist_node *unode; struct ulist_iterator uiter; @@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * in a first step, we check all affected qgroups if any limits would * be exceeded */ - ulist = ulist_alloc(GFP_ATOMIC); - if (!ulist) { - ret = -ENOMEM; - goto out; - } - ret = ulist_add(ulist, qgroup->qgroupid, + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; @@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) } list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(ulist, glist->group->qgroupid, + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); if (ret < 0) goto out; @@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) * no limits exceeded, now record the reservation into all qgroups */ ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; @@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(ulist); - return ret; } @@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; - struct ulist *ulist = NULL; struct ulist_node *unode; struct ulist_iterator uiter; u64 ref_root = root->root_key.objectid; @@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) if (!qgroup) goto out; - ulist = ulist_alloc(GFP_ATOMIC); - if (!ulist) { - btrfs_std_error(fs_info, -ENOMEM); - goto out; - } - ret = ulist_add(ulist, qgroup->qgroupid, + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(ulist, &uiter))) { + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; @@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(ulist, glist->group->qgroupid, + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, (uintptr_t)glist->group, GFP_ATOMIC); if (ret < 0) goto out; @@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(ulist); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) @@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. */ static int -qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, +qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_trans_handle *trans, struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; - struct btrfs_fs_info *fs_info = qscan->fs_info; struct ulist *roots = NULL; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2007,11 +1996,10 @@ out: static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) { - struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, - work); + struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, + qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info = qscan->fs_info; struct ulist *tmp = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; @@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(qscan, path, trans, + err = qgroup_rescan_leaf(fs_info, path, trans, tmp, scratch_leaf); } if (err > 0) @@ -2049,7 +2037,6 @@ out: kfree(scratch_leaf); ulist_free(tmp); btrfs_free_path(path); - kfree(qscan); mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; @@ -2068,47 +2055,74 @@ out: } else { pr_err("btrfs: qgroup scan failed with %d\n", err); } -} -static void -qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan) -{ - memset(&qscan->work, 0, sizeof(qscan->work)); - qscan->work.func = btrfs_qgroup_rescan_worker; - qscan->fs_info = fs_info; - - pr_info("btrfs: qgroup scan started\n"); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work); + complete_all(&fs_info->qgroup_rescan_completion); } -int -btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +/* + * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all + * memory required for the rescan context. + */ +static int +qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, + int init_flags) { int ret = 0; - struct rb_node *n; - struct btrfs_qgroup *qgroup; - struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS); - if (!qscan) - return -ENOMEM; + if (!init_flags && + (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) || + !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) { + ret = -EINVAL; + goto err; + } mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) - ret = -EINPROGRESS; - else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) - ret = -EINVAL; - if (ret) { - spin_unlock(&fs_info->qgroup_lock); - mutex_unlock(&fs_info->qgroup_rescan_lock); - kfree(qscan); - return ret; + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = -EINPROGRESS; + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto err; + } + + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; memset(&fs_info->qgroup_rescan_progress, 0, sizeof(fs_info->qgroup_rescan_progress)); + fs_info->qgroup_rescan_progress.objectid = progress_objectid; + + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + init_completion(&fs_info->qgroup_rescan_completion); + + memset(&fs_info->qgroup_rescan_work, 0, + sizeof(fs_info->qgroup_rescan_work)); + fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + + if (ret) { +err: + pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); + return ret; + } + + return 0; +} + +static void +qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + spin_lock(&fs_info->qgroup_lock); /* clear all current qgroup tracking information */ for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { qgroup = rb_entry(n, struct btrfs_qgroup, node); @@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup->excl_cmpr = 0; } spin_unlock(&fs_info->qgroup_lock); - mutex_unlock(&fs_info->qgroup_rescan_lock); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct btrfs_trans_handle *trans; - qgroup_rescan_start(fs_info, qscan); + ret = qgroup_rescan_init(fs_info, 0, 1); + if (ret) + return ret; + + /* + * We have set the rescan_progress to 0, which means no more + * delayed refs will be accounted by btrfs_qgroup_account_ref. + * However, btrfs_qgroup_account_ref may be right after its call + * to btrfs_find_all_roots, in which case it would still do the + * accounting. + * To solve this, we're committing the transaction, which will + * ensure we run all delayed refs and only after that, we are + * going to clear all tracking information for a clean start. + */ + + trans = btrfs_join_transaction(fs_info->fs_root); + if (IS_ERR(trans)) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans, fs_info->fs_root); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } + + qgroup_rescan_zero_tracking(fs_info); + + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); return 0; } + +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +{ + int running; + int ret = 0; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (running) + ret = wait_for_completion_interruptible( + &fs_info->qgroup_rescan_completion); + + return ret; +} + +/* + * this is only called from open_ctree where we're still single threaded, thus + * locking is omitted here. + */ +void +btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) +{ + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); +} diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0740621..0525e13 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1050,7 +1050,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 704a1b8..1209649 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; + u64 last_snap = 0; int ret; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); @@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BTRFS_TREE_RELOC_OBJECTID); BUG_ON(ret); + last_snap = btrfs_root_last_snapshot(&root->root_item); btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); } else { @@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); root_item->drop_level = 0; + /* + * abuse rtransid, it is safe because it is impossible to + * receive data into a relocation tree. + */ + btrfs_set_root_rtransid(root_item, last_snap); + btrfs_set_root_otransid(root_item, trans->transid); } btrfs_tree_unlock(eb); @@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); + reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; return reloc_root; @@ -1773,7 +1780,7 @@ again: if (!eb || !extent_buffer_uptodate(eb)) { ret = (!eb) ? -ENOMEM : -EIO; free_extent_buffer(eb); - return ret; + break; } btrfs_tree_lock(eb); if (cow) { @@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { + struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; + u64 last_snap; + u64 otransid; + u64 objectid; LIST_HEAD(reloc_roots); int found = 0; int ret = 0; @@ -2308,12 +2319,44 @@ again: } else { list_del_init(&reloc_root->root_list); } + + /* + * we keep the old last snapshod transid in rtranid when we + * created the relocation tree. + */ + last_snap = btrfs_root_rtransid(&reloc_root->root_item); + otransid = btrfs_root_otransid(&reloc_root->root_item); + objectid = reloc_root->root_key.offset; + ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); if (ret < 0) { if (list_empty(&reloc_root->root_list)) list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; + } else if (!ret) { + /* + * recover the last snapshot tranid to avoid + * the space balance break NOCOW. + */ + root = read_fs_root(rc->extent_root->fs_info, + objectid); + if (IS_ERR(root)) + continue; + + if (btrfs_root_refs(&root->root_item) == 0) + continue; + + trans = btrfs_join_transaction(root); + BUG_ON(IS_ERR(trans)); + + /* Check if the fs/file tree was snapshoted or not. */ + if (btrfs_root_last_snapshot(&root->root_item) == + otransid - 1) + btrfs_set_root_last_snapshot(&root->root_item, + last_snap); + + btrfs_end_transaction(trans, root); } } @@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc, struct btrfs_path *path; struct btrfs_key key; int ret; + bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, + SKINNY_METADATA); if (tree_block_processed(bytenr, blocksize, rc)) return 0; @@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - +again: key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; + if (skinny) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + } path->search_commit_root = 1; path->skip_locking = 1; @@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc, if (ret < 0) goto out; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (ret > 0) { - if (key.objectid == bytenr && - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; + if (ret > 0 && skinny) { + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + (key.type == BTRFS_METADATA_ITEM_KEY || + (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == blocksize))) + ret = 0; + } + + if (ret) { + skinny = false; + btrfs_release_path(path); + goto again; + } } BUG_ON(ret); @@ -3350,6 +3412,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, } truncate: + ret = btrfs_check_trunc_cache_free_space(root, + &fs_info->global_block_rsv); + if (ret) + goto out; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -4077,7 +4144,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4088,7 +4155,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); return rc; } @@ -4105,7 +4173,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) return -ENOMEM; @@ -4154,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + ret = btrfs_start_all_delalloc_inodes(fs_info, 0); if (ret < 0) { err = ret; goto out; } - btrfs_wait_ordered_extents(fs_info->tree_root, 0); + btrfs_wait_all_ordered_extents(fs_info, 0); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4271,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.type != BTRFS_ROOT_ITEM_KEY) break; - reloc_root = btrfs_read_fs_root_no_radix(root, &key); + reloc_root = btrfs_read_fs_root(root, &key); if (IS_ERR(reloc_root)) { err = PTR_ERR(reloc_root); goto out; @@ -4306,7 +4374,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(root->fs_info); if (!rc) { err = -ENOMEM; goto out; @@ -4390,10 +4458,8 @@ out: int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) { struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; int ret; u64 disk_bytenr; LIST_HEAD(list); @@ -4407,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) if (ret) goto out; + disk_bytenr = ordered->start; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del_init(&sums->list); - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + sums->bytenr = disk_bytenr; + disk_bytenr += sums->len; btrfs_add_ordered_sum(inode, ordered, sums); } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 5bf1ed5..ffb1036 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot, } /* - * lookup the root with the highest offset for a given objectid. The key we do - * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 - * on error. + * btrfs_find_root - lookup the root by the key. + * root: the root of the root tree + * search_key: the key to search + * path: the path we search + * root_item: the root item of the tree we look for + * root_key: the reak key of the tree we look for + * + * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * of the search key, just lookup the root with the highest offset for a + * given objectid. + * + * If we find something return 0, otherwise > 0, < 0 on error. */ -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, - struct btrfs_root_item *item, struct btrfs_key *key) +int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key) { - struct btrfs_path *path; - struct btrfs_key search_key; struct btrfs_key found_key; struct extent_buffer *l; int ret; int slot; - search_key.objectid = objectid; - search_key.type = BTRFS_ROOT_ITEM_KEY; - search_key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0); if (ret < 0) - goto out; + return ret; - BUG_ON(ret == 0); - if (path->slots[0] == 0) { - ret = 1; - goto out; + if (search_key->offset != -1ULL) { /* the search key is exact */ + if (ret > 0) + goto out; + } else { + BUG_ON(ret == 0); /* Logical error */ + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + ret = 0; } + l = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid || + if (found_key.objectid != search_key->objectid || found_key.type != BTRFS_ROOT_ITEM_KEY) { ret = 1; goto out; } - if (item) - btrfs_read_root_item(l, slot, item); - if (key) - memcpy(key, &found_key, sizeof(found_key)); - ret = 0; + if (root_item) + btrfs_read_root_item(l, slot, root_item); + if (root_key) + memcpy(root_key, &found_key, sizeof(found_key)); out: - btrfs_free_path(path); + btrfs_release_path(path); return ret; } @@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } -/* - * at mount time we want to find all the old transaction snapshots that were in - * the process of being deleted if we crashed. This is any root item with an - * offset lower than the latest root. They need to be queued for deletion to - * finish what was happening when we crashed. - */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_root *dead_root; - struct btrfs_root_item *ri; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - int ret; - u32 nritems; - struct extent_buffer *leaf; - int slot; - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) - goto next; - - if (key.objectid < objectid) - goto next; - - if (key.objectid > objectid) - break; - - ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); - if (btrfs_disk_root_refs(leaf, ri) != 0) - goto next; - - memcpy(&found_key, &key, sizeof(key)); - key.offset++; - btrfs_release_path(path); - dead_root = - btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &found_key); - if (IS_ERR(dead_root)) { - ret = PTR_ERR(dead_root); - goto err; - } - - ret = btrfs_add_dead_root(dead_root); - if (ret) - goto err; - goto again; -next: - slot++; - path->slots[0]++; - } - ret = 0; -err: - btrfs_free_path(path); - return ret; -} - int btrfs_find_orphan_roots(struct btrfs_root *tree_root) { struct extent_buffer *leaf; @@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct btrfs_root *root; int err = 0; int ret; + bool can_recover = true; + + if (tree_root->fs_info->sb->s_flags & MS_RDONLY) + can_recover = false; path = btrfs_alloc_path(); if (!path) @@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; - root = btrfs_read_fs_root_no_name(tree_root->fs_info, - &root_key); - if (!IS_ERR(root)) + root = btrfs_read_fs_root(tree_root, &root_key); + err = PTR_RET(root); + if (err && err != -ENOENT) { + break; + } else if (err == -ENOENT) { + struct btrfs_trans_handle *trans; + + btrfs_release_path(path); + + trans = btrfs_join_transaction(tree_root); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_error(tree_root->fs_info, err, + "Failed to start trans to delete " + "orphan item"); + break; + } + err = btrfs_del_orphan_item(trans, tree_root, + root_key.objectid); + btrfs_end_transaction(trans, tree_root); + if (err) { + btrfs_error(tree_root->fs_info, err, + "Failed to delete root orphan " + "item"); + break; + } continue; + } - ret = PTR_ERR(root); - if (ret != -ENOENT) { - err = ret; + if (btrfs_root_refs(&root->root_item) == 0) { + btrfs_add_dead_root(root); + continue; + } + + err = btrfs_init_fs_root(root); + if (err) { + btrfs_free_fs_root(root); break; } - ret = btrfs_find_dead_roots(tree_root, root_key.objectid); - if (ret) { - err = ret; + root->orphan_item_inserted = 1; + + err = btrfs_insert_fs_root(root->fs_info, root); + if (err) { + BUG_ON(err == -EEXIST); + btrfs_free_fs_root(root); break; } } @@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_path *path; int ret; - struct btrfs_root_item *ri; - struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) @@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, goto out; BUG_ON(ret != 0); - leaf = path->nodes[0]; - ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); ret = btrfs_del_item(trans, root, path); out: diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f489e24..4ba2a69 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1296,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { page->io_error = 1; sblock->no_io_error_seen = 0; @@ -1431,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; bio->bi_bdev = page_bad->dev->bdev; @@ -1522,7 +1522,7 @@ again: sbio->dev = wr_ctx->tgtdev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); if (!bio) { mutex_unlock(&wr_ctx->wr_lock); return -ENOMEM; @@ -1930,7 +1930,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); + bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); if (!bio) return -ENOMEM; sbio->bio = bio; @@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, u8 *csum) { struct btrfs_ordered_sum *sum = NULL; - int ret = 0; - unsigned long i; + unsigned long index; unsigned long num_sectors; while (!list_empty(&sctx->csum_list)) { @@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, if (!sum) return 0; + index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; num_sectors = sum->len / sctx->sectorsize; - for (i = 0; i < num_sectors; ++i) { - if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sctx->csum_size); - ret = 1; - break; - } - } - if (ret && i == num_sectors - 1) { + memcpy(csum, sum->sums + index, sctx->csum_size); + if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); } - return ret; + return 1; } /* scrub extent tries to collect up to 64 kB for each bio */ @@ -2505,6 +2499,7 @@ again: if (ret) goto out; + scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { logical += increment; @@ -3204,16 +3199,18 @@ out: static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) { - unsigned long index; struct scrub_copy_nocow_ctx *nocow_ctx = ctx; - int ret = 0; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct page *page; struct btrfs_root *local_root; u64 physical_for_dev_replace; u64 len; - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + unsigned long index; int srcu_index; + int ret; + int err; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; @@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) return PTR_ERR(local_root); } + if (btrfs_root_refs(&local_root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); + return -ENOENT; + } + key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; @@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); + /* Avoid truncate/dio/punch hole.. */ + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + + ret = 0; physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; len = nocow_ctx->len; while (len >= PAGE_CACHE_SIZE) { - struct page *page = NULL; - int ret_sub; - index = offset >> PAGE_CACHE_SHIFT; - +again: page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { pr_err("find_or_create_page() failed\n"); ret = -ENOMEM; - goto next_page; + goto out; } if (PageUptodate(page)) { @@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) goto next_page; } else { ClearPageError(page); - ret_sub = extent_read_full_page(&BTRFS_I(inode)-> + err = extent_read_full_page(&BTRFS_I(inode)-> io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); - if (ret_sub) { - ret = ret_sub; + if (err) { + ret = err; goto next_page; } - wait_on_page_locked(page); + + lock_page(page); + /* + * If the page has been remove from the page cache, + * the data on it is meaningless, because it may be + * old one, the new data may be written into the new + * page in the page cache. + */ + if (page->mapping != inode->i_mapping) { + page_cache_release(page); + goto again; + } if (!PageUptodate(page)) { ret = -EIO; goto next_page; } } - ret_sub = write_page_nocow(nocow_ctx->sctx, - physical_for_dev_replace, page); - if (ret_sub) { - ret = ret_sub; - goto next_page; - } - + err = write_page_nocow(nocow_ctx->sctx, + physical_for_dev_replace, page); + if (err) + ret = err; next_page: - if (page) { - unlock_page(page); - put_page(page); - } + unlock_page(page); + page_cache_release(page); + + if (ret) + break; + offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } - - if (inode) - iput(inode); +out: + mutex_unlock(&inode->i_mutex); + iput(inode); return ret; } @@ -3307,7 +3321,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); return -EIO; } - bio = bio_alloc(GFP_NOFS, 1); + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ff40f1c..d3f3b43 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p) } } -static struct fs_path *fs_path_alloc(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc(void) { struct fs_path *p; @@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx) return p; } -static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) +static struct fs_path *fs_path_alloc_reversed(void) { struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return NULL; p->reversed = 1; @@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) return p; } -static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) +static void fs_path_free(struct fs_path *p) { if (!p) return; @@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, * * path must point to the INODE_REF or INODE_EXTREF when called. */ -static int iterate_inode_ref(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, int resolve, iterate_inode_ref_t iterate, void *ctx) { @@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx, unsigned long elem_size; unsigned long ptr; - p = fs_path_alloc_reversed(sctx); + p = fs_path_alloc_reversed(); if (!p) return -ENOMEM; tmp_path = alloc_path_for_send(); if (!tmp_path) { - fs_path_free(sctx, p); + fs_path_free(p); return -ENOMEM; } @@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx, out: btrfs_free_path(tmp_path); - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * * path must point to the dir item when called. */ -static int iterate_dir_item(struct send_ctx *sctx, - struct btrfs_root *root, struct btrfs_path *path, +static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { @@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index, * Retrieve the first path of an inode. If an inode has more then one * ref/hardlink, this is ignored. */ -static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, +static int get_inode_path(struct btrfs_root *root, u64 ino, struct fs_path *path) { int ret; @@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, goto out; } - ret = iterate_inode_ref(sctx, root, p, &found_key, 1, - __copy_first_ref, path); + ret = iterate_inode_ref(root, p, &found_key, 1, + __copy_first_ref, path); if (ret < 0) goto out; ret = 0; @@ -1314,8 +1312,7 @@ out: return ret; } -static int read_symlink(struct send_ctx *sctx, - struct btrfs_root *root, +static int read_symlink(struct btrfs_root *root, u64 ino, struct fs_path *dest) { @@ -1562,8 +1559,7 @@ out: * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, * generation of the parent dir and the name of the dir entry. */ -static int get_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, u64 ino, +static int get_first_ref(struct btrfs_root *root, u64 ino, u64 *dir, u64 *dir_gen, struct fs_path *name) { int ret; @@ -1628,8 +1624,7 @@ out: return ret; } -static int is_first_ref(struct send_ctx *sctx, - struct btrfs_root *root, +static int is_first_ref(struct btrfs_root *root, u64 ino, u64 dir, const char *name, int name_len) { @@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx, u64 tmp_dir; u64 tmp_dir_gen; - tmp_name = fs_path_alloc(sctx); + tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); if (ret < 0) goto out; @@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx, ret = !memcmp(tmp_name->start, name, name_len); out: - fs_path_free(sctx, tmp_name); + fs_path_free(tmp_name); return ret; } @@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) if (!sctx->parent_root) goto out; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) return -ENOMEM; - ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); + ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); if (ret < 0) goto out; @@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) name->start, fs_path_len(name)); out: - fs_path_free(sctx, name); + fs_path_free(name); return ret; } @@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, * send_root or parent_root for ref lookup. */ if (ino < sctx->send_progress) - ret = get_first_ref(sctx, sctx->send_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->send_root, ino, + parent_ino, parent_gen, dest); else - ret = get_first_ref(sctx, sctx->parent_root, ino, - parent_ino, parent_gen, dest); + ret = get_first_ref(sctx->parent_root, ino, + parent_ino, parent_gen, dest); if (ret < 0) goto out; @@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_gen = 0; int stop = 0; - name = fs_path_alloc(sctx); + name = fs_path_alloc(); if (!name) { ret = -ENOMEM; goto out; @@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, } out: - fs_path_free(sctx, name); + fs_path_free(name); if (!ret) fs_path_unreverse(dest); return ret; @@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) verbose_printk("btrfs: send_utimes %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); btrfs_free_path(path); return ret; } @@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) verbose_printk("btrfs: send_create_inode %llu\n", ino); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (S_ISLNK(mode)) { fs_path_reset(p); - ret = read_symlink(sctx, sctx->send_root, ino, p); + ret = read_symlink(sctx->send_root, ino, p); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); @@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir, return 0; } -static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) +static void __free_recorded_refs(struct list_head *head) { struct recorded_ref *cur; while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(sctx, cur->full_path); + fs_path_free(cur->full_path); list_del(&cur->list); kfree(cur); } @@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) static void free_recorded_refs(struct send_ctx *sctx) { - __free_recorded_refs(sctx, &sctx->new_refs); - __free_recorded_refs(sctx, &sctx->deleted_refs); + __free_recorded_refs(&sctx->new_refs); + __free_recorded_refs(&sctx->deleted_refs); } /* @@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, int ret; struct fs_path *orphan; - orphan = fs_path_alloc(sctx); + orphan = fs_path_alloc(); if (!orphan) return -ENOMEM; @@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, ret = send_rename(sctx, path, orphan); out: - fs_path_free(sctx, orphan); + fs_path_free(orphan); return ret; } @@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); - valid_path = fs_path_alloc(sctx); + valid_path = fs_path_alloc(); if (!valid_path) { ret = -ENOMEM; goto out; @@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = is_first_ref(sctx, sctx->parent_root, - ow_inode, cur->dir, cur->name, - cur->name_len); + ret = is_first_ref(sctx->parent_root, + ow_inode, cur->dir, cur->name, + cur->name_len); if (ret < 0) goto out; if (ret) { @@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); out: free_recorded_refs(sctx); ulist_free(check_dirs); - fs_path_free(sctx, valid_path); + fs_path_free(valid_path); return ret; } @@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index, struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index, out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *p; u64 gen; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index, out: if (ret) - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, __record_new_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx) { int ret; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, __record_deleted_ref, sctx); if (ret < 0) goto out; ret = 0; @@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index, return 0; } -static int find_iref(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_iref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, u64 dir, struct fs_path *name) @@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx, ctx.name = name; ctx.found_idx = -1; - ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); if (ret < 0) return ret; @@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index, int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->parent_root, sctx->right_path, + ret = find_iref(sctx->parent_root, sctx->right_path, sctx->cmp_key, dir, name); if (ret == -ENOENT) ret = __record_new_ref(num, dir, index, name, sctx); @@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index, int ret; struct send_ctx *sctx = ctx; - ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, dir, name); if (ret == -ENOENT) ret = __record_deleted_ref(num, dir, index, name, sctx); @@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx) { int ret = 0; - ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, __record_changed_new_ref, sctx); if (ret < 0) goto out; - ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); if (ret < 0) goto out; @@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx, found_key.type != BTRFS_INODE_EXTREF_KEY)) break; - ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, - sctx); + ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); btrfs_release_path(path); if (ret < 0) goto out; @@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, struct fs_path *p; posix_acl_xattr_header dummy_acl; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, ret = send_set_xattr(sctx, p, name, name_len, data, data_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key, struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key, ret = send_remove_xattr(sctx, p, name, name_len); out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx) { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + ret = iterate_dir_item(sctx->send_root, sctx->left_path, + sctx->cmp_key, __process_new_xattr, sctx); return ret; } @@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx) { int ret; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, + sctx->cmp_key, __process_deleted_xattr, sctx); return ret; } @@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmalloc(data_len, GFP_NOFS); + ctx->found_data = kmemdup(data, data_len, GFP_NOFS); if (!ctx->found_data) return -ENOMEM; - memcpy(ctx->found_data, data, data_len); return 1; } return 0; } -static int find_xattr(struct send_ctx *sctx, - struct btrfs_root *root, +static int find_xattr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, @@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); if (ret < 0) return ret; @@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, char *found_data = NULL; int found_data_len = 0; - ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, - sctx->cmp_key, name, name_len, &found_data, - &found_data_len); + ret = find_xattr(sctx->parent_root, sctx->right_path, + sctx->cmp_key, name, name_len, &found_data, + &found_data_len); if (ret == -ENOENT) { ret = __process_new_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, int ret; struct send_ctx *sctx = ctx; - ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, - name, name_len, NULL, NULL); + ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, + name, name_len, NULL, NULL); if (ret == -ENOENT) ret = __process_deleted_xattr(num, di_key, name, name_len, data, data_len, type, ctx); @@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx) { int ret = 0; - ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, + ret = iterate_dir_item(sctx->send_root, sctx->left_path, sctx->cmp_key, __process_changed_new_xattr, sctx); if (ret < 0) goto out; - ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, + ret = iterate_dir_item(sctx->parent_root, sctx->right_path, sctx->cmp_key, __process_changed_deleted_xattr, sctx); out: @@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(sctx, root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, &found_key, + __process_new_xattr, sctx); if (ret < 0) goto out; @@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) int num_read = 0; mm_segment_t old_fs; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len); tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); set_fs(old_fs); if (ret < 0) return ret; @@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " clone_root->root->objectid, clone_root->ino, clone_root->offset); - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); } else { - ret = get_inode_path(sctx, clone_root->root, - clone_root->ino, p); + ret = get_inode_path(clone_root->root, clone_root->ino, p); } if (ret < 0) goto out; @@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(sctx); + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx, tlv_put_failure: out: - fs_path_free(sctx, p); + fs_path_free(p); return ret; } @@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) send_root = BTRFS_I(file_inode(mnt_file))->root; fs_info = send_root->fs_info; + /* + * This is done when we lookup the root, it should already be complete + * by the time we get here. + */ + WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); + + /* + * If we just created this root we need to make sure that the orphan + * cleanup has been done and committed since we search the commit root, + * so check its commit root transid with our otransid and if they match + * commit the transaction to make sure everything is updated. + */ + down_read(&send_root->fs_info->extent_commit_sem); + if (btrfs_header_generation(send_root->commit_root) == + btrfs_root_otransid(&send_root->root_item)) { + struct btrfs_trans_handle *trans; + + up_read(&send_root->fs_info->extent_commit_sem); + + trans = btrfs_attach_transaction_barrier(send_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto out; + } + /* ENOENT means theres no transaction */ + } else { + ret = btrfs_commit_transaction(trans, send_root); + if (ret) + goto out; + } + } else { + up_read(&send_root->fs_info->extent_commit_sem); + } + arg = memdup_user(arg_, sizeof(*arg)); if (IS_ERR(arg)) { ret = PTR_ERR(arg); @@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; clone_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!clone_root) { - ret = -EINVAL; - goto out; - } if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; @@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (!sctx->parent_root) { - ret = -EINVAL; + if (IS_ERR(sctx->parent_root)) { + ret = PTR_ERR(sctx->parent_root); goto out; } } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4807ce..8eb6191 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,7 +51,6 @@ #include "print-tree.h" #include "xattr.h" #include "volumes.h" -#include "version.h" #include "export.h" #include "compression.h" #include "rcu-string.h" @@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, return; } ACCESS_ONCE(trans->transaction->aborted) = errno; + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_blocked_wait); __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* @@ -776,9 +778,6 @@ find_root: if (IS_ERR(new_root)) return ERR_CAST(new_root); - if (btrfs_root_refs(&new_root->root_item) == 0) - return ERR_PTR(-ENOENT); - dir_id = btrfs_root_dirid(&new_root->root_item); setup_root: location.objectid = dir_id; @@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 1); + btrfs_wait_all_ordered_extents(fs_info, 1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1263,6 +1262,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); ret = btrfs_commit_super(root); if (ret) @@ -1684,6 +1684,18 @@ static void btrfs_interface_exit(void) printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); } +static void btrfs_print_info(void) +{ + printk(KERN_INFO "Btrfs loaded" +#ifdef CONFIG_BTRFS_DEBUG + ", debug=on" +#endif +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + ", integrity-checker=on" +#endif + "\n"); +} + static int __init init_btrfs_fs(void) { int err; @@ -1732,11 +1744,9 @@ static int __init init_btrfs_fs(void) btrfs_init_lockdep(); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + btrfs_print_info(); btrfs_test_free_space_cache(); -#endif - printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; unregister_ioctl: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0544587..d58cce7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -34,12 +34,43 @@ #define BTRFS_ROOT_TRANS_TAG 0 +static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { + [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | + __TRANS_START), + [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN), + [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), + [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | + __TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK), +}; + static void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); + while (!list_empty(&transaction->pending_chunks)) { + struct extent_map *em; + + em = list_first_entry(&transaction->pending_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } -static inline int can_join_transaction(struct btrfs_transaction *trans, - int type) +static inline void extwriter_counter_inc(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_inc(&trans->num_extwriters); +} + +static inline void extwriter_counter_dec(struct btrfs_transaction *trans, + unsigned int type) +{ + if (type & TRANS_EXTWRITERS) + atomic_dec(&trans->num_extwriters); +} + +static inline void extwriter_counter_init(struct btrfs_transaction *trans, + unsigned int type) +{ + atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); +} + +static inline int extwriter_counter_read(struct btrfs_transaction *trans) { - return !(trans->in_commit && - type != TRANS_JOIN && - type != TRANS_JOIN_NOLOCK); + return atomic_read(&trans->num_extwriters); } /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, int type) +static noinline int join_transaction(struct btrfs_root *root, unsigned int type) { struct btrfs_transaction *cur_trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -74,32 +122,19 @@ loop: return -EROFS; } - if (fs_info->trans_no_join) { - /* - * If we are JOIN_NOLOCK we're already committing a current - * transaction, we just need a handle to deal with something - * when committing the transaction, such as inode cache and - * space cache. It is a special case. - */ - if (type != TRANS_JOIN_NOLOCK) { - spin_unlock(&fs_info->trans_lock); - return -EBUSY; - } - } - cur_trans = fs_info->running_transaction; if (cur_trans) { if (cur_trans->aborted) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } - if (!can_join_transaction(cur_trans, type)) { + if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; + extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); return 0; } @@ -112,6 +147,12 @@ loop: if (type == TRANS_ATTACH) return -ENOENT; + /* + * JOIN_NOLOCK only happens during the transaction commit, so + * it is impossible that ->running_transaction is NULL + */ + BUG_ON(type == TRANS_JOIN_NOLOCK); + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; @@ -120,7 +161,7 @@ loop: if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure - * to redo the trans_no_join checks above + * to redo the checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); goto loop; @@ -131,17 +172,15 @@ loop: } atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; + extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; + cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.root = RB_ROOT; @@ -164,7 +203,6 @@ loop: "creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); - spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); atomic_set(&cur_trans->delayed_refs.ref_seq, 0); @@ -172,6 +210,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); + INIT_LIST_HEAD(&cur_trans->pending_chunks); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; } +static inline int is_transaction_blocked(struct btrfs_transaction *trans) +{ + return (trans->state >= TRANS_STATE_BLOCKED && + trans->state < TRANS_STATE_UNBLOCKED && + !trans->aborted); +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; - if (cur_trans && cur_trans->blocked) { + if (cur_trans && is_transaction_blocked(cur_trans)) { atomic_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); wait_event(root->fs_info->transaction_wait, - !cur_trans->blocked); + cur_trans->state >= TRANS_STATE_UNBLOCKED || + cur_trans->aborted); put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); @@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, int type, +start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; @@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, return ERR_PTR(-EROFS); if (current->journal_info) { - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; WARN_ON(h->use_count > 2); @@ -366,7 +413,7 @@ again: * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). */ - if (type < TRANS_JOIN_NOLOCK) + if (type & __TRANS_FREEZABLE) sb_start_intwrite(root->fs_info->sb); if (may_wait_transaction(root, type)) @@ -408,7 +455,8 @@ again: INIT_LIST_HEAD(&h->new_bgs); smp_mb(); - if (cur_trans->blocked && may_wait_transaction(root, type)) { + if (cur_trans->state >= TRANS_STATE_BLOCKED && + may_wait_transaction(root, type)) { btrfs_commit_transaction(h, root); goto again; } @@ -429,7 +477,7 @@ got_it: return h; join_fail: - if (type < TRANS_JOIN_NOLOCK) + if (type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: @@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction() - catch the running transaction + * btrfs_attach_transaction_barrier() - catch the running transaction * * It is similar to the above function, the differentia is this one * will wait for all the inactive transactions until they fully @@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { - wait_event(commit->commit_wait, commit->commit_done); + wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); } int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) @@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) spin_lock(&root->fs_info->trans_lock); list_for_each_entry_reverse(t, &root->fs_info->trans_list, list) { - if (t->in_commit) { - if (t->commit_done) + if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; atomic_inc(&cur_trans->use_count); @@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root) static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; + if (root->fs_info->global_block_rsv.space_info->full && + btrfs_should_throttle_delayed_refs(trans, root)) + return 1; - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); - return ret ? 1 : 0; + return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); } int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, @@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, int err; smp_mb(); - if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + if (cur_trans->state >= TRANS_STATE_BLOCKED || + cur_trans->delayed_refs.flushing) return 1; updates = trans->delayed_ref_updates; @@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; - int count = 0; + unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 1) { - unsigned long cur = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (btrfs_should_throttle_delayed_refs(trans, root)) { + cur = max_t(unsigned long, cur, 1); trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; + btrfs_run_delayed_refs(trans, root, cur); } btrfs_trans_release_metadata(trans, root); @@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && - should_end_transaction(trans, root)) { - trans->transaction->blocked = 1; - smp_wmb(); + should_end_transaction(trans, root) && + ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { + spin_lock(&info->trans_lock); + if (cur_trans->state == TRANS_STATE_RUNNING) + cur_trans->state = TRANS_STATE_BLOCKED; + spin_unlock(&info->trans_lock); } - if (lock && cur_trans->blocked && !cur_trans->in_commit) { + if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) { /* * We may race with somebody else here so end up having @@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } } - if (trans->type < TRANS_JOIN_NOLOCK) + if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); + extwriter_counter_dec(cur_trans, trans->type); smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) @@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; - struct blk_plug plug; - blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; - blk_finish_plug(&plug); return werr; } @@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, { int ret; int ret2; + struct blk_plug plug; + blk_start_plug(&plug); ret = btrfs_write_marked_extents(root, dirty_pages, mark); + blk_finish_plug(&plug); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); if (ret) @@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root) int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; int ret = 0; + spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->in_commit; + trans = info->running_transaction; + if (trans) + ret = (trans->state >= TRANS_STATE_COMMIT_START); spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { + struct btrfs_transaction *trans; int ret = 0; + spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->blocked; + trans = info->running_transaction; + if (trans) + ret = is_transaction_blocked(trans); spin_unlock(&info->trans_lock); return ret; } @@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) static void wait_current_trans_commit_start(struct btrfs_root *root, struct btrfs_transaction *trans) { - wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); + wait_event(root->fs_info->transaction_blocked_wait, + trans->state >= TRANS_STATE_COMMIT_START || + trans->aborted); } /* @@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_transaction *trans) { wait_event(root->fs_info->transaction_wait, - trans->commit_done || (trans->in_commit && !trans->blocked)); + trans->state >= TRANS_STATE_UNBLOCKED || + trans->aborted); } /* @@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); - if (list_empty(&cur_trans->list)) { - spin_unlock(&root->fs_info->trans_lock); - btrfs_end_transaction(trans, root); - return; - } + /* + * If the transaction is removed from the list, it means this + * transaction has been committed successfully, so it is impossible + * to call the cleanup function. + */ + BUG_ON(list_empty(&cur_trans->list)); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { - root->fs_info->trans_no_join = 1; + cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&root->fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; } spin_unlock(&root->fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, root); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans == root->fs_info->running_transaction) + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->trans_lock); + put_transaction(cur_trans); put_transaction(cur_trans); @@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); } static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - int snap_pending = 0; int ret; - if (!flush_on_commit) { - spin_lock(&root->fs_info->trans_lock); - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; - spin_unlock(&root->fs_info->trans_lock); - } - - if (flush_on_commit || snap_pending) { - ret = btrfs_start_delalloc_inodes(root, 1); - if (ret) - return ret; - btrfs_wait_ordered_extents(root, 1); - } - ret = btrfs_run_delayed_items(trans, root); if (ret) return ret; @@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, return ret; } -/* - * btrfs_transaction state sequence: - * in_commit = 0, blocked = 0 (initial) - * in_commit = 1, blocked = 1 - * blocked = 0 - * commit_done = 1 - */ +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + return btrfs_start_all_delalloc_inodes(fs_info, 1); + return 0; +} + +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +{ + if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + btrfs_wait_all_ordered_extents(fs_info, 1); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - unsigned long joined = 0; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; - DEFINE_WAIT(wait); int ret; - int should_grow = 0; - unsigned long now = get_seconds(); ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { @@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * start sending their work down. */ cur_trans->delayed_refs.flushing = 1; + smp_wmb(); if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); @@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - spin_lock(&cur_trans->commit_lock); - if (cur_trans->in_commit) { - spin_unlock(&cur_trans->commit_lock); + spin_lock(&root->fs_info->trans_lock); + if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans, root); @@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - trans->transaction->in_commit = 1; - trans->transaction->blocked = 1; - spin_unlock(&cur_trans->commit_lock); + cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&root->fs_info->transaction_blocked_wait); - spin_lock(&root->fs_info->trans_lock); if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (!prev_trans->commit_done) { + if (prev_trans->state != TRANS_STATE_COMPLETED) { atomic_inc(&prev_trans->use_count); spin_unlock(&root->fs_info->trans_lock); @@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); } - if (!btrfs_test_opt(root, SSD) && - (now < cur_trans->start_time || now - cur_trans->start_time < 1)) - should_grow = 1; - - do { - joined = cur_trans->num_joined; - - WARN_ON(cur_trans != trans->transaction); - - ret = btrfs_flush_all_pending_stuffs(trans, root); - if (ret) - goto cleanup_transaction; + extwriter_counter_dec(cur_trans, trans->type); - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); + ret = btrfs_start_delalloc_flush(root->fs_info); + if (ret) + goto cleanup_transaction; - if (atomic_read(&cur_trans->num_writers) > 1) - schedule_timeout(MAX_SCHEDULE_TIMEOUT); - else if (should_grow) - schedule_timeout(1); + ret = btrfs_flush_all_pending_stuffs(trans, root); + if (ret) + goto cleanup_transaction; - finish_wait(&cur_trans->writer_wait, &wait); - } while (atomic_read(&cur_trans->num_writers) > 1 || - (should_grow && cur_trans->num_joined != joined)); + wait_event(cur_trans->writer_wait, + extwriter_counter_read(cur_trans) == 0); + /* some pending stuffs might be added after the previous flush. */ ret = btrfs_flush_all_pending_stuffs(trans, root); if (ret) goto cleanup_transaction; + btrfs_wait_delalloc_flush(root->fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting - * no_join so make sure to wait for num_writers to == 1 again. + * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; + cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&root->fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); - trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); + cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->reloc_mutex); @@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - cur_trans->commit_done = 1; - root->fs_info->last_trans_committed = cur_trans->transid; - + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); spin_lock(&root->fs_info->trans_lock); @@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - if (trans->type < TRANS_JOIN_NOLOCK) + if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(root->fs_info->sb); trace_btrfs_transaction_commit(root); @@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (fs_info->sb->s_flags & MS_RDONLY) { - pr_debug("btrfs: cleaner called for RO fs!\n"); - return 0; - } - spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 24c9733..005b037 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -22,21 +22,33 @@ #include "delayed-ref.h" #include "ctree.h" +enum btrfs_trans_state { + TRANS_STATE_RUNNING = 0, + TRANS_STATE_BLOCKED = 1, + TRANS_STATE_COMMIT_START = 2, + TRANS_STATE_COMMIT_DOING = 3, + TRANS_STATE_UNBLOCKED = 4, + TRANS_STATE_COMPLETED = 5, + TRANS_STATE_MAX = 6, +}; + struct btrfs_transaction { u64 transid; /* + * total external writers(USERSPACE/START/ATTACH) in this + * transaction, it must be zero before the transaction is + * being committed + */ + atomic_t num_extwriters; + /* * total writers in this transaction, it must be zero before the * transaction can end */ atomic_t num_writers; atomic_t use_count; - unsigned long num_joined; - - spinlock_t commit_lock; - int in_commit; - int commit_done; - int blocked; + /* Be protected by fs_info->trans_lock when we want to change it. */ + enum btrfs_trans_state state; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; @@ -44,17 +56,27 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head ordered_operations; + struct list_head pending_chunks; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, - TRANS_JOIN_NOLOCK, - TRANS_ATTACH, -}; +#define __TRANS_FREEZABLE (1U << 0) + +#define __TRANS_USERSPACE (1U << 8) +#define __TRANS_START (1U << 9) +#define __TRANS_ATTACH (1U << 10) +#define __TRANS_JOIN (1U << 11) +#define __TRANS_JOIN_NOLOCK (1U << 12) + +#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) +#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) +#define TRANS_ATTACH (__TRANS_ATTACH) +#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) +#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) + +#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ + __TRANS_ATTACH) struct btrfs_trans_handle { u64 transid; @@ -70,7 +92,7 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; - enum btrfs_trans_type type; + unsigned int type; /* * this root is only needed to validate that the root passed to * start_transaction is the same as the one passed to end_transaction. diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c276ac9..2c67914 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <linux/list_sort.h> #include "ctree.h" #include "transaction.h" @@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log, { int ret = 0; + /* + * If this fs is mixed then we need to be able to process the leaves to + * pin down any logged extents, so we have to read the block. + */ + if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; + } + if (wc->pin) ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, eb->start, eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { + if (wc->pin && btrfs_header_level(eb) == 0) + ret = btrfs_exclude_logged_extents(log, eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); if (ret) break; - } else if (key.type == BTRFS_INODE_REF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); - if (ret && ret != -ENOENT) - break; - ret = 0; - } else if (key.type == BTRFS_INODE_EXTREF_KEY) { + } else if (key.type == BTRFS_INODE_REF_KEY || + key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); if (ret && ret != -ENOENT) @@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; unsigned long log_transid = 0; + struct blk_plug plug; mutex_lock(&root->log_mutex); log_transid = root->log_transid; @@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ + blk_start_plug(&plug); ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { + blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + blk_finish_plug(&plug); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); @@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_DIRTY | EXTENT_NEW); + ret = btrfs_write_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); + blk_finish_plug(&plug); if (ret) { btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); @@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, @@ -4016,8 +4034,7 @@ again: if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) break; - log = btrfs_read_fs_root_no_radix(log_root_tree, - &found_key); + log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); btrfs_error(fs_info, ret, diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 7b417e2..b0a523b2 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 new_alloced = ulist->nodes_alloced + 128; struct ulist_node *new_nodes; void *old = NULL; + int i; + + for (i = 0; i < ulist->nnodes; i++) + rb_erase(&ulist->nodes[i].rb_node, &ulist->root); /* * if nodes_alloced == ULIST_SIZE no memory has been allocated @@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, ulist->nodes = new_nodes; ulist->nodes_alloced = new_alloced; + + /* + * krealloc actually uses memcpy, which does not copy rb_node + * pointers, so we have to do it ourselves. Otherwise we may + * be bitten by crashes. + */ + for (i = 0; i < ulist->nnodes; i++) { + ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]); + if (ret < 0) + return ret; + } } ulist->nodes[ulist->nnodes].val = val; ulist->nodes[ulist->nnodes].aux = aux; diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h deleted file mode 100644 index 9bf3946..0000000 --- a/fs/btrfs/version.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __BTRFS_VERSION_H -#define __BTRFS_VERSION_H -#define BTRFS_BUILD_VERSION "Btrfs" -#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0e925ce..78b8717 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -982,6 +982,35 @@ out: return ret; } +static int contains_pending_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 *start, u64 len) +{ + struct extent_map *em; + int ret = 0; + + list_for_each_entry(em, &trans->transaction->pending_chunks, list) { + struct map_lookup *map; + int i; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev != device) + continue; + if (map->stripes[i].physical >= *start + len || + map->stripes[i].physical + em->orig_block_len <= + *start) + continue; + *start = map->stripes[i].physical + + em->orig_block_len; + ret = 1; + } + } + + return ret; +} + + /* * find_free_dev_extent - find free space in the specified device * @device: the device which we search the free space in @@ -1002,7 +1031,8 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, */ search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: max_hole_start = search_start; max_hole_size = 0; hole_size = 0; if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; - goto error; + goto out; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto error; - } path->reada = 2; + path->search_commit_root = 1; + path->skip_locking = 1; key.objectid = device->devid; key.offset = search_start; @@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, if (key.offset > search_start) { hole_size = key.offset - search_start; + /* + * Have to check before we set max_hole_start, otherwise + * we could end up sending back this offset anyway. + */ + if (contains_pending_extent(trans, device, + &search_start, + hole_size)) + hole_size = 0; + if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; @@ -1124,6 +1164,11 @@ next: max_hole_size = hole_size; } + if (contains_pending_extent(trans, device, &search_start, hole_size)) { + btrfs_release_path(path); + goto again; + } + /* See above. */ if (hole_size < num_bytes) ret = -ENOSPC; @@ -1132,7 +1177,6 @@ next: out: btrfs_free_path(path); -error: *start = max_hole_start; if (len) *len = max_hole_size; @@ -1244,47 +1288,22 @@ out: return ret; } -static noinline int find_next_chunk(struct btrfs_root *root, - u64 objectid, u64 *offset) +static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct btrfs_chunk *chunk; - struct btrfs_key found_key; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = objectid; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - - BUG_ON(ret == 0); /* Corruption */ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct rb_node *n; + u64 ret = 0; - ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - *offset = 0; - } else { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != objectid) - *offset = 0; - else { - chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_chunk); - *offset = found_key.offset + - btrfs_chunk_length(path->nodes[0], chunk); - } + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + n = rb_last(&em_tree->map); + if (n) { + em = rb_entry(n, struct extent_map, rb_node); + ret = em->start + em->len; } - ret = 0; -error: - btrfs_free_path(path); + read_unlock(&em_tree->lock); + return ret; } @@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) btrfs_dev_replace_unlock(&root->fs_info->dev_replace); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && root->fs_info->fs_devices->rw_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid5\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; goto out; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && root->fs_info->fs_devices->rw_devices <= 3) { - printk(KERN_ERR "btrfs: unable to go below three " - "devices on raid6\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; goto out; } @@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bh = NULL; disk_super = NULL; if (!device) { - printk(KERN_ERR "btrfs: no missing devices found to " - "remove\n"); + ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; goto out; } } else { @@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if (device->is_tgtdev_for_dev_replace) { - pr_err("btrfs: unable to remove the dev_replace target dev\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto error_brelse; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { - printk(KERN_ERR "btrfs: unable to remove the only writeable " - "device\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto error_brelse; } @@ -3120,14 +3127,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; if (num_devices == 1) allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices < 4) + else if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6); - + if (num_devices > 2) + allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices > 3) + allowed |= (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || (bctl->data.target & ~allowed))) { @@ -3296,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); - if (IS_ERR(tsk)) - return PTR_ERR(tsk); - - return 0; + return PTR_RET(tsk); } int btrfs_recover_balance(struct btrfs_fs_info *fs_info) @@ -3682,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) } static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes_out, u64 *stripe_size_out, - u64 start, u64 type) + struct btrfs_root *extent_root, u64 start, + u64 type) { struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -3792,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(device, + ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -3904,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - *map_ret = map; num_bytes = stripe_size * data_stripes; - *stripe_size_out = stripe_size; - *num_bytes_out = num_bytes; - trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); em = alloc_extent_map(); @@ -3922,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->len = num_bytes; em->block_start = 0; em->block_len = em->len; + em->orig_block_len = stripe_size; em_tree = &extent_root->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + if (!ret) { + list_add_tail(&em->list, &trans->transaction->pending_chunks); + atomic_inc(&em->refs); + } write_unlock(&em_tree->lock); if (ret) { free_extent_map(em); goto error; } - for (i = 0; i < map->num_stripes; ++i) { - struct btrfs_device *device; - u64 dev_offset; - - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; - - ret = btrfs_alloc_dev_extent(trans, device, - info->chunk_root->root_key.objectid, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, dev_offset, stripe_size); - if (ret) - goto error_dev_extent; - } - ret = btrfs_make_block_group(trans, extent_root, 0, type, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes); - if (ret) { - i = map->num_stripes - 1; - goto error_dev_extent; - } + if (ret) + goto error_del_extent; free_extent_map(em); check_raid56_incompat_flag(extent_root->fs_info, type); @@ -3961,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, kfree(devices_info); return 0; -error_dev_extent: - for (; i >= 0; i--) { - struct btrfs_device *device; - int err; - - device = map->stripes[i].dev; - err = btrfs_free_dev_extent(trans, device, start); - if (err) { - btrfs_abort_transaction(trans, extent_root, err); - break; - } - } +error_del_extent: write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -3987,33 +3961,68 @@ error: return ret; } -static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, - struct map_lookup *map, u64 chunk_offset, - u64 chunk_size, u64 stripe_size) + u64 chunk_offset, u64 chunk_size) { - u64 dev_offset; struct btrfs_key key; struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - size_t item_size = btrfs_chunk_item_size(map->num_stripes); - int index = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct map_lookup *map; + size_t item_size; + u64 dev_offset; + u64 stripe_size; + int i = 0; int ret; + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(extent_root->fs_info, "unable to find logical " + "%Lu len %Lu", chunk_offset, chunk_size); + return -EINVAL; + } + + if (em->start != chunk_offset || em->len != chunk_size) { + btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" + " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + chunk_size, em->start, em->len); + free_extent_map(em); + return -EINVAL; + } + + map = (struct map_lookup *)em->bdev; + item_size = btrfs_chunk_item_size(map->num_stripes); + stripe_size = em->orig_block_len; + chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) - return -ENOMEM; + if (!chunk) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; - index = 0; - while (index < map->num_stripes) { - device = map->stripes[index].dev; device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); if (ret) - goto out_free; - index++; + goto out; + ret = btrfs_alloc_dev_extent(trans, device, + chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + chunk_offset, dev_offset, + stripe_size); + if (ret) + goto out; } spin_lock(&extent_root->fs_info->free_chunk_lock); @@ -4021,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, map->num_stripes); spin_unlock(&extent_root->fs_info->free_chunk_lock); - index = 0; stripe = &chunk->stripe; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - dev_offset = map->stripes[index].physical; + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; btrfs_set_stack_stripe_devid(stripe, device->devid); btrfs_set_stack_stripe_offset(stripe, dev_offset); memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; - index++; } btrfs_set_stack_chunk_length(chunk, chunk_size); @@ -4049,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, key.offset = chunk_offset; ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { /* * TODO: Cleanup of inserted chunk root in case of @@ -4059,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); } -out_free: +out: kfree(chunk); + free_extent_map(em); return ret; } @@ -4075,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type) { u64 chunk_offset; - u64 chunk_size; - u64 stripe_size; - struct map_lookup *map; - struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; - int ret; - - ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - &chunk_offset); - if (ret) - return ret; - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, type); - if (ret) - return ret; - - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) - return ret; - return 0; + chunk_offset = find_next_chunk(extent_root->fs_info); + return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, @@ -4104,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, { u64 chunk_offset; u64 sys_chunk_offset; - u64 chunk_size; - u64 sys_chunk_size; - u64 stripe_size; - u64 sys_stripe_size; u64 alloc_profile; - struct map_lookup *map; - struct map_lookup *sys_map; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; int ret; - ret = find_next_chunk(fs_info->chunk_root, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - if (ret) - return ret; - + chunk_offset = find_next_chunk(fs_info); alloc_profile = btrfs_get_alloc_profile(extent_root, 0); - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, alloc_profile); + ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, + alloc_profile); if (ret) return ret; - sys_chunk_offset = chunk_offset + chunk_size; - + sys_chunk_offset = find_next_chunk(root->fs_info); alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); - ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, - &sys_chunk_size, &sys_stripe_size, - sys_chunk_offset, alloc_profile); + ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, + alloc_profile); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; } ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - /* - * Modifying chunk tree needs allocating new blocks from both - * system block group and metadata block group. So we only can - * do operations require modifying the chunk tree after both - * block groups were created. - */ - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - ret = __finish_chunk_alloc(trans, extent_root, sys_map, - sys_chunk_offset, sys_chunk_size, - sys_stripe_size); if (ret) btrfs_abort_transaction(trans, root, ret); - out: - return ret; } @@ -4436,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; - if (mirror_num > map->num_stripes) - mirror_num = 0; - stripe_len = map->stripe_len; stripe_nr = offset; /* @@ -5019,42 +4970,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void *merge_stripe_index_into_bio_private(void *bi_private, - unsigned int stripe_index) -{ - /* - * with single, dup, RAID0, RAID1 and RAID10, stripe_index is - * at most 1. - * The alternative solution (instead of stealing bits from the - * pointer) would be to allocate an intermediate structure - * that contains the old private pointer plus the stripe_index. - */ - BUG_ON((((uintptr_t)bi_private) & 3) != 0); - BUG_ON(stripe_index > 3); - return (void *)(((uintptr_t)bi_private) | stripe_index); -} - -static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) -{ - return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); -} - -static unsigned int extract_stripe_index_from_bio_private(void *bi_private) -{ - return (unsigned int)((uintptr_t)bi_private) & 3; -} - static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) { atomic_inc(&bbio->error); if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = - extract_stripe_index_from_bio_private( - bio->bi_private); + btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); @@ -5084,8 +5009,7 @@ static void btrfs_end_bio(struct bio *bio, int err) } bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ @@ -5211,8 +5135,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct btrfs_device *dev = bbio->stripes[dev_nr].dev; bio->bi_private = bbio; - bio->bi_private = merge_stripe_index_into_bio_private( - bio->bi_private, (unsigned int)dev_nr); + btrfs_io_bio(bio)->stripe_index = dev_nr; bio->bi_end_io = btrfs_end_bio; bio->bi_sector = physical >> 9; #ifdef DEBUG @@ -5273,8 +5196,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) if (atomic_dec_and_test(&bbio->stripes_pending)) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; + btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_sector = logical >> 9; kfree(bbio); bio_endio(bio, -EIO); @@ -5352,7 +5274,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); + bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; @@ -5397,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, return NULL; list_add(&device->dev_list, &fs_devices->devices); - device->dev_root = root->fs_info->dev_root; device->devid = devid; device->work.func = pending_bios_fn; device->fs_devices = fs_devices; @@ -5623,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root, } fill_device_from_item(leaf, dev_item, device); - device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; if (device->writeable && !device->is_tgtdev_for_dev_replace) { device->fs_devices->total_rw_bytes += device->total_bytes; @@ -5781,6 +5701,17 @@ error: return ret; } +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); +} + static void __btrfs_reset_dev_stats(struct btrfs_device *dev) { int i; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 845ccbb..8670558 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -152,6 +152,26 @@ struct btrfs_fs_devices { int rotating; }; +/* + * we need the mirror number and stripe index to be passed around + * the call chain while we are processing end_io (especially errors). + * Really, what we need is a btrfs_bio structure that has this info + * and is properly sized with its stripe array, but we're not there + * quite yet. We have our own btrfs bioset, and all of the bios + * we allocate are actually btrfs_io_bios. We'll cram as much of + * struct btrfs_bio as we can into this over time. + */ +struct btrfs_io_bio { + unsigned long mirror_num; + unsigned long stripe_index; + struct bio bio; +}; + +static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_io_bio, bio); +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; @@ -296,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats); +void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); @@ -316,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, unsigned long btrfs_full_stripe_len(struct btrfs_root *root, struct btrfs_mapping_tree *map_tree, u64 logical); +int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + u64 chunk_offset, u64 chunk_size); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { diff --git a/fs/buffer.c b/fs/buffer.c index d2a4d1b..4d74335 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh) EXPORT_SYMBOL(unlock_buffer); /* + * Returns if the page has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the PageDirty information is stale. If + * any of the pages are locked, it is assumed they are locked for IO. + */ +void buffer_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct buffer_head *head, *bh; + *dirty = false; + *writeback = false; + + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + return; + + if (PageWriteback(page)) + *writeback = true; + + head = page_buffers(page); + bh = head; + do { + if (buffer_locked(bh)) + *writeback = true; + + if (buffer_dirty(bh)) + *dirty = true; + + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(buffer_check_dirty_writeback); + +/* * Block until a buffer comes unlocked. This doesn't stop it * from becoming locked again - you have to lock it yourself * if you want to preserve its state. @@ -1454,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh) * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected - * @offset: the index of the truncation point + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * * block_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. @@ -1465,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh) * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ -void block_invalidatepage(struct page *page, unsigned long offset) +void block_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; + unsigned int stop = length + offset; BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) goto out; + /* + * Check for overflow + */ + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + head = page_buffers(page); bh = head; do { @@ -1481,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset) next = bh->b_this_page; /* + * Are we still fully in range ? + */ + if (next_off > stop) + goto out; + + /* * is this block fully invalidated? */ if (offset <= curr_off) @@ -1501,6 +1549,7 @@ out: } EXPORT_SYMBOL(block_invalidatepage); + /* * We attach and possibly dirty the buffers atomically wrt * __set_page_dirty_buffers() via private_lock. try_to_free_buffers @@ -2841,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, * they may have been added in ext3_writepage(). Make them * freeable here, so the page does not leak. */ - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); return 0; /* don't care */ } diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 746ce53..d4c1206 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -13,8 +13,6 @@ #include <linux/mount.h> #include "internal.h" -#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) - struct cachefiles_lookup_data { struct cachefiles_xattr *auxdata; /* auxiliary data */ char *key; /* key path */ @@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object) object = container_of(_object, struct cachefiles_object, fscache); cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); + + if (!fscache_use_cookie(_object)) { + _leave(" [relinq]"); + return; + } + cookie = object->fscache.cookie; if (!cookie->def->get_aux) { + fscache_unuse_cookie(_object); _leave(" [no aux]"); return; } auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); if (!auxdata) { + fscache_unuse_cookie(_object); _leave(" [nomem]"); return; } auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); + fscache_unuse_cookie(_object); ASSERTCMP(auxlen, <, 511); auxdata->len = auxlen + 1; @@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) #endif /* delete retired objects */ - if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) && _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 8c01c5fc..25badd1 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", - prefix, fscache_object_states[object->fscache.state], + prefix, object->fscache.state->name, object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, found_dentry: kdebug("preemptive burial: OBJ%x [%s] %p", object->fscache.debug_id, - fscache_object_states[object->fscache.state], + object->fscache.state->name, dentry); - if (object->fscache.state < FSCACHE_OBJECT_DYING) { + if (fscache_object_is_live(&object->fscache)) { printk(KERN_ERR "\n"); printk(KERN_ERR "CacheFiles: Error:" " Can't preemptively bury live object\n"); @@ -192,7 +192,7 @@ try_again: /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ wait_for_old_object: - if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { + if (fscache_object_is_live(&object->fscache)) { printk(KERN_ERR "\n"); printk(KERN_ERR "CacheFiles: Error:" " Unexpected object collision\n"); @@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); /* look up the victim */ - mutex_lock_nested(&dir->d_inode->i_mutex, 1); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 317f9ee..ebaff36 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -12,6 +12,7 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/swap.h> #include "internal.h" /* @@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) */ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, struct fscache_retrieval *op, - struct page *netpage, - struct pagevec *pagevec) + struct page *netpage) { struct cachefiles_one_read *monitor; struct address_space *bmapping; @@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, _enter(""); - pagevec_reinit(pagevec); - _debug("read back %p{%lu,%d}", netpage, netpage->index, page_count(netpage)); @@ -283,9 +281,7 @@ installed_new_backing_page: backpage = newpage; newpage = NULL; - page_cache_get(backpage); - pagevec_add(pagevec, backpage); - __pagevec_lru_add_file(pagevec); + lru_cache_add_file(backpage); read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (block) { /* submit the apparently valid page to the backing fs to be * read from disk */ - ret = cachefiles_read_backing_file_one(object, op, page, - &pagevec); + ret = cachefiles_read_backing_file_one(object, op, page); } else if (cachefiles_has_space(cache, 0, 1) == 0) { /* there's space in the cache we can use */ fscache_mark_page_cached(op, page); @@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, { struct cachefiles_one_read *monitor = NULL; struct address_space *bmapping = object->backer->d_inode->i_mapping; - struct pagevec lru_pvec; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; _enter(""); - pagevec_init(&lru_pvec, 0); - list_for_each_entry_safe(netpage, _n, list, lru) { list_del(&netpage->lru); @@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backpage = newpage; newpage = NULL; - page_cache_get(backpage); - if (!pagevec_add(&lru_pvec, backpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(backpage); reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* install a monitor */ page_cache_get(netpage); @@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); @@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, out: /* tidy up */ - pagevec_lru_add_file(&lru_pvec); - if (newpage) page_cache_release(newpage); if (netpage) diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 73b4628..2476e51 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, struct dentry *dentry = object->dentry; int ret; - ASSERT(object->fscache.cookie); ASSERT(dentry); _enter("%p,#%d", object, auxdata->len); /* attempt to install the cache metadata directly */ - _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + _debug("SET #%u", auxdata->len); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, @@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, struct dentry *dentry = object->dentry; int ret; - ASSERT(object->fscache.cookie); ASSERT(dentry); _enter("%p,#%d", object, auxdata->len); /* attempt to install the cache metadata directly */ - _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); + _debug("SET #%u", auxdata->len); ret = vfs_setxattr(dentry, cachefiles_xattr_cache, &auxdata->type, auxdata->len, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e68ac1..5318a3b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page) * dirty page counters appropriately. Only called if there is private * data on the page. */ -static void ceph_invalidatepage(struct page *page, unsigned long offset) +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode; struct ceph_inode_info *ci; @@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); ci = ceph_inode(inode); - if (offset == 0) { - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", - inode, page, page->index, offset); + if (offset == 0 && length == PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); page->private = 0; ClearPagePrivate(page); } else { - dout("%p invalidatepage %p idx %lu partial dirty page\n", - inode, page, page->index); + dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n", + inode, page, page->index, offset, length); } } @@ -438,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page_offset(page); - int len = PAGE_CACHE_SIZE; - loff_t i_size; - int err = 0; struct ceph_snap_context *snapc, *oldest; - u64 snap_size = 0; + loff_t page_off = page_offset(page); long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -474,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + /* is this a partial page at end of file? */ - if (snap_size) - i_size = snap_size; - else - i_size = i_size_read(inode); - if (i_size < page_off + len) - len = i_size - page_off; + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); @@ -494,7 +501,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, - ci->i_truncate_seq, ci->i_truncate_size, + truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); @@ -631,25 +638,6 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_osdc_put_request(req); } -static struct ceph_osd_request * -ceph_writepages_osd_request(struct inode *inode, u64 offset, u64 *len, - struct ceph_snap_context *snapc, int num_ops) -{ - struct ceph_fs_client *fsc; - struct ceph_inode_info *ci; - struct ceph_vino vino; - - fsc = ceph_inode_to_client(inode); - ci = ceph_inode(inode); - vino = ceph_vino(inode); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - - return ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, offset, len, num_ops, CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK, - snapc, ci->i_truncate_seq, ci->i_truncate_size, true); -} - /* * initiate async writeback */ @@ -658,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping, { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -670,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size; + u64 truncate_size, snap_size; + u32 truncate_seq; /* * Include a 'sync' in the OSD request if this is a data * integrity write (e.g., O_SYNC write or fsync()), or if our * cap is being revoked. */ - do_sync = wbc->sync_mode == WB_SYNC_ALL; - if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) do_sync = 1; dout("writepages_start %p dosync=%d (mode=%s)\n", inode, do_sync, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - fsc = ceph_inode_to_client(inode); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { pr_warning("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ @@ -728,6 +717,14 @@ retry: snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the * start of the original file range. */ @@ -739,7 +736,6 @@ retry: while (!done && index <= end) { int num_ops = do_sync ? 2 : 1; - struct ceph_vino vino; unsigned i; int first; pgoff_t next; @@ -833,17 +829,18 @@ get_more_pages: * that it will use. */ if (locked_pages == 0) { - size_t size; - BUG_ON(pages); - /* prepare async write request */ offset = (u64)page_offset(page); len = wsize; - req = ceph_writepages_osd_request(inode, - offset, &len, snapc, - num_ops); - + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); if (IS_ERR(req)) { rc = PTR_ERR(req); unlock_page(page); @@ -854,8 +851,8 @@ get_more_pages: req->r_inode = inode; max_pages = calc_pages_for(0, (u64)len); - size = max_pages * sizeof (*pages); - pages = kmalloc(size, GFP_NOFS); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); if (!pages) { pool = fsc->wb_pagevec_pool; pages = mempool_alloc(pool, GFP_NOFS); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index da0f9b8..25442b4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -612,9 +605,11 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) { + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ci->i_auth_cap = cap; + } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { @@ -695,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } @@ -802,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && __cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) + return 1; } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); @@ -1980,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, __ceph_caps_used(ci), __ceph_caps_wanted(ci), @@ -2055,7 +2072,11 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); - __ceph_do_pending_vmtruncate(inode, !(need & CEPH_CAP_FILE_WR)); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); } @@ -2473,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a @@ -3042,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f02d82b..a40ceda 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p) * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int __dcache_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = filp->private_data; - struct dentry *parent = filp->f_dentry; + struct ceph_file_info *fi = file->private_data; + struct dentry *parent = file->f_dentry; struct inode *dir = parent->d_inode; struct list_head *p; struct dentry *dentry, *last; @@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp, last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, + dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, last); spin_lock(&parent->d_lock); /* start at beginning? */ - if (filp->f_pos == 2 || last == NULL || - filp->f_pos < ceph_dentry(last)->offset) { + if (ctx->pos == 2 || last == NULL || + ctx->pos < ceph_dentry(last)->offset) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -157,11 +156,11 @@ more: if (!d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - filp->f_pos <= di->offset) + ctx->pos <= di->offset) break; dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, dentry->d_name.len, dentry->d_name.name, di->offset, - filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + ctx->pos, d_unhashed(dentry) ? " unhashed" : "", !dentry->d_inode ? " null" : ""); spin_unlock(&dentry->d_lock); p = p->prev; @@ -173,29 +172,27 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - filp->f_pos = di->offset; - err = filldir(dirent, dentry->d_name.name, - dentry->d_name.len, di->offset, + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12); - - if (last) { - if (err < 0) { + dentry->d_inode->i_mode >> 12)) { + if (last) { /* remember our position */ fi->dentry = last; fi->next_offset = di->offset; - } else { - dput(last); } + dput(dentry); + return 0; } - last = dentry; - if (err < 0) - goto out; + if (last) + dput(last); + last = dentry; - filp->f_pos++; + ctx->pos++; /* make sure a dentry wasn't dropped while we didn't have parent lock */ if (!ceph_dir_is_complete(dir)) { @@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } -static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = file_inode(filp); + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(filp->f_pos); - int off = fpos_off(filp->f_pos); + unsigned frag = fpos_frag(ctx->pos); + int off = fpos_off(ctx->pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; const int max_entries = fsc->mount_options->max_readdir; const int max_bytes = fsc->mount_options->max_readdir_bytes; - dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); + dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ - if (filp->f_pos == 0) { + if (ctx->pos == 0) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ fi->dir_release_count = atomic_read(&ci->i_release_count); dout("readdir off 0 -> '.'\n"); - if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), + if (!dir_emit(ctx, ".", 1, ceph_translate_ino(inode->i_sb, inode->i_ino), - inode->i_mode >> 12) < 0) + inode->i_mode >> 12)) return 0; - filp->f_pos = 1; + ctx->pos = 1; off = 1; } - if (filp->f_pos == 1) { - ino_t ino = parent_ino(filp->f_dentry); + if (ctx->pos == 1) { + ino_t ino = parent_ino(file->f_dentry); dout("readdir off 1 -> '..'\n"); - if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), + if (!dir_emit(ctx, "..", 2, ceph_translate_ino(inode->i_sb, ino), - inode->i_mode >> 12) < 0) + inode->i_mode >> 12)) return 0; - filp->f_pos = 2; + ctx->pos = 2; off = 2; } /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((filp->f_pos == 2 || fi->dentry) && + if ((ctx->pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(filp, dirent, filldir); + err = __dcache_readdir(file, ctx); if (err != -EAGAIN) return err; } else { @@ -327,7 +324,7 @@ more: return PTR_ERR(req); req->r_inode = inode; ihold(inode); - req->r_dentry = dget(filp->f_dentry); + req->r_dentry = dget(file->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); @@ -379,15 +376,16 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); + + ctx->pos = ceph_make_fpos(frag, off); while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; struct ceph_vino vino; ino_t ino; dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, pos, + off, off - fi->offset, rinfo->dir_nr, ctx->pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); @@ -395,16 +393,15 @@ more: vino.ino = le64_to_cpu(in->ino); vino.snap = le64_to_cpu(in->snapid); ino = ceph_vino_to_ino(vino); - if (filldir(dirent, + if (!dir_emit(ctx, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } off++; - filp->f_pos = pos + 1; + ctx->pos++; } if (fi->last_name) { @@ -417,7 +414,7 @@ more: if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); off = 0; - filp->f_pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, off); dout("readdir next frag is %x\n", frag); goto more; } @@ -432,11 +429,11 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = filp->f_pos; + ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); - dout("readdir %p filp %p done.\n", inode, filp); + dout("readdir %p file %p done.\n", inode, file); return 0; } @@ -1268,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .readdir = ceph_readdir, + .iterate = ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e169..2ddf061 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -716,7 +716,6 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); hold_mutex = true; @@ -809,7 +808,6 @@ retry_snap: out: if (hold_mutex) mutex_unlock(&inode->i_mutex); - sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return written ? written : err; @@ -824,7 +822,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); @@ -866,16 +864,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) break; } - if (offset < 0 || offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index be0f7e2..f3a2abf 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " "inode %p ino %llx.%llx\n", - dn, dn->d_count, - realdn, realdn->d_count, + dn, d_count(dn), + realdn, d_count(realdn), realdn->d_inode, ceph_vinop(realdn->d_inode)); dput(dn); dn = realdn; @@ -1465,7 +1465,9 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - __ceph_do_pending_vmtruncate(inode, true); + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1492,7 +1494,7 @@ void ceph_queue_vmtruncate(struct inode *inode) * Make sure any pending truncation is applied before doing anything * that may depend on it. */ -void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock) +void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; @@ -1525,11 +1527,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - if (needlock) - mutex_lock(&inode->i_mutex); truncate_inode_pages(inode->i_mapping, to); - if (needlock) - mutex_unlock(&inode->i_mutex); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { @@ -1588,7 +1586,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); err = inode_change_ok(inode, attr); if (err != 0) @@ -1770,7 +1768,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode, false); + __ceph_do_pending_vmtruncate(inode); return err; out: spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 202dd3d6..ae6d14e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) } /** - * Must be called with BKL already held. Fills in the passed + * Must be called with lock_flocks() already held. Fills in the passed * counter variables, so you can prepare pagelist metadata before calling * ceph_encode_locks. */ @@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) } /** - * Encode the flock and fcntl locks for the given inode into the pagelist. - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, - * sequential flock locks. - * Must be called with lock_flocks() already held. - * If we encounter more of a specific lock type than expected, - * we return the value 1. + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. */ -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, - int num_fcntl_locks, int num_flock_locks) +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct ceph_filelock cephlock; int err = 0; int seen_fcntl = 0; int seen_flock = 0; + int l = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); - if (err) - goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { ++seen_fcntl; @@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } - - err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); - if (err) - goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { ++seen_flock; @@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } fail: return err; } +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + /* * Given a pointer to a lock, convert it to a ceph filelock */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4f22671..187bf21 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1391,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, num = le32_to_cpu(head->num); dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); session->s_num_cap_releases += num; /* requeue completed messages */ @@ -1553,7 +1554,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, dentry->d_count, *base, len, path); + dentry, d_count(dentry), *base, len, path); return path; } @@ -2454,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); @@ -2478,39 +2480,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - struct ceph_pagelist_cursor trunc_point; - - ceph_pagelist_set_cursor(pagelist, &trunc_point); - do { - lock_flocks(); - ceph_count_locks(inode, &num_fcntl_locks, - &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - unlock_flocks(); - - /* pre-alloc pagelist */ - ceph_pagelist_truncate(pagelist, &trunc_point); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_pagelist_reserve(pagelist, - rec.v2.flock_len); - - /* encode locks */ - if (!err) { - lock_flocks(); - err = ceph_encode_locks(inode, - pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_flocks(); - } - } while (err == -ENOSPC); + struct ceph_filelock *flocks; + +encode_again: + spin_lock(&inode->i_lock); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + spin_unlock(&inode->i_lock); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + spin_lock(&inode->i_lock); + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + spin_unlock(&inode->i_lock); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } - out_free: kfree(path); out_dput: @@ -3035,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 9278dec..132b64e 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -92,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; + struct ceph_mds_info *info; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -126,24 +127,27 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds >= 0 && mds < m->m_max_mds && state > 0) { - m->m_info[mds].global_id = global_id; - m->m_info[mds].state = state; - m->m_info[mds].addr = addr; - m->m_info[mds].laggy = - (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); - m->m_info[mds].num_export_targets = num_export_targets; - if (num_export_targets) { - m->m_info[mds].export_targets = - kcalloc(num_export_targets, sizeof(u32), - GFP_NOFS); - for (j = 0; j < num_export_targets; j++) - m->m_info[mds].export_targets[j] = - ceph_decode_32(&pexport_targets); - } else { - m->m_info[mds].export_targets = NULL; - } + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; } } @@ -170,7 +174,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7d377c9..6627b26 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ - if (*dev_name_end != ':') { + if (dev_name_end < dev_name || *dev_name_end != ':') { pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8696be2f..cbded57 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -534,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); @@ -692,7 +692,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern int ceph_inode_set_size(struct inode *inode, loff_t size); -extern void __ceph_do_pending_vmtruncate(struct inode *inode, bool needlock); +extern void __ceph_do_pending_vmtruncate(struct inode *inode); extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); @@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops; extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, - int p_locks, int f_locks); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 9b6b2b6d..be661d8 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -675,17 +675,18 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - spin_lock(&ci->i_ceph_lock); - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { err = vxattr->getxattr_cb(ci, value, size); - goto out; + return err; } + spin_lock(&ci->i_ceph_lock); + dout("getxattr %p ver=%lld index_ver=%lld\n", inode, + ci->i_xattrs.version, ci->i_xattrs.index_version); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 2906ee2..603f18a 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -10,6 +10,7 @@ config CIFS select CRYPTO_ECB select CRYPTO_DES select CRYPTO_SHA256 + select CRYPTO_CMAC help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index d597483..f3ac415 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) tcon->nativeFileSystem); } seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" - "\nPathComponentMax: %d Status: 0x%d", + "\n\tPathComponentMax: %d Status: 0x%d", le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), @@ -224,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); + if (server->ops->dump_share_caps) + server->ops->dump_share_caps(m, tcon); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); @@ -595,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) return single_open(file, cifs_security_flags_proc_show, NULL); } +/* + * Ensure that if someone sets a MUST flag, that we disable all other MAY + * flags except for the ones corresponding to the given MUST flag. If there are + * multiple MUST flags, then try to prefer more secure ones. + */ +static void +cifs_security_flags_handle_must_flags(unsigned int *flags) +{ + unsigned int signflags = *flags & CIFSSEC_MUST_SIGN; + + if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) + *flags = CIFSSEC_MUST_KRB5; + else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) + *flags = CIFSSEC_MUST_NTLMSSP; + else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) + *flags = CIFSSEC_MUST_NTLMV2; + else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) + *flags = CIFSSEC_MUST_NTLM; + else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) + *flags = CIFSSEC_MUST_LANMAN; + else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) + *flags = CIFSSEC_MUST_PLNTXT; + + *flags |= signflags; +} + static ssize_t cifs_security_flags_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { + int rc; unsigned int flags; char flags_string[12]; char c; @@ -620,26 +649,35 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, global_secflags = CIFSSEC_MAX; return count; } else if (!isdigit(c)) { - cifs_dbg(VFS, "invalid flag %c\n", c); + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", + flags_string); return -EINVAL; } } - /* else we have a number */ - flags = simple_strtoul(flags_string, NULL, 0); + /* else we have a number */ + rc = kstrtouint(flags_string, 0, &flags); + if (rc) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", + flags_string); + return rc; + } cifs_dbg(FYI, "sec flags 0x%x\n", flags); - if (flags <= 0) { - cifs_dbg(VFS, "invalid security flags %s\n", flags_string); + if (flags == 0) { + cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return -EINVAL; } if (flags & ~CIFSSEC_MASK) { - cifs_dbg(VFS, "attempt to set unsupported security flags 0x%x\n", + cifs_dbg(VFS, "Unsupported security flags: 0x%x\n", flags & ~CIFSSEC_MASK); return -EINVAL; } + + cifs_security_flags_handle_must_flags(&flags); + /* flags look ok - update the global security flags for cifs module */ global_secflags = flags; if (global_secflags & CIFSSEC_MUST_SIGN) { diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8e33ec6..58df174 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/vfs.h> #include <linux/fs.h> +#include <linux/inet.h> #include "cifsglob.h" #include "cifsproto.h" #include "cifsfs.h" @@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void) } /** - * cifs_get_share_name - extracts share name from UNC - * @node_name: pointer to UNC string + * cifs_build_devname - build a devicename from a UNC and optional prepath + * @nodename: pointer to UNC string + * @prepath: pointer to prefixpath (or NULL if there isn't one) * - * Extracts sharename form full UNC. - * i.e. strips from UNC trailing path that is not part of share - * name and fixup missing '\' in the beginning of DFS node refferal - * if necessary. - * Returns pointer to share name on success or ERR_PTR on error. - * Caller is responsible for freeing returned string. + * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer + * big enough to hold the final thing. Copy the UNC from the nodename, and + * concatenate the prepath onto the end of it if there is one. + * + * Returns pointer to the built string, or a ERR_PTR. Caller is responsible + * for freeing the returned string. */ -static char *cifs_get_share_name(const char *node_name) +static char * +cifs_build_devname(char *nodename, const char *prepath) { - int len; - char *UNC; - char *pSep; - - len = strlen(node_name); - UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, - GFP_KERNEL); - if (!UNC) - return ERR_PTR(-ENOMEM); + size_t pplen; + size_t unclen; + char *dev; + char *pos; + + /* skip over any preceding delimiters */ + nodename += strspn(nodename, "\\"); + if (!*nodename) + return ERR_PTR(-EINVAL); - /* get share name and server name */ - if (node_name[1] != '\\') { - UNC[0] = '\\'; - strncpy(UNC+1, node_name, len); - len++; - UNC[len] = 0; - } else { - strncpy(UNC, node_name, len); - UNC[len] = 0; - } + /* get length of UNC and set pos to last char */ + unclen = strlen(nodename); + pos = nodename + unclen - 1; - /* find server name end */ - pSep = memchr(UNC+2, '\\', len-2); - if (!pSep) { - cifs_dbg(VFS, "%s: no server name end in node name: %s\n", - __func__, node_name); - kfree(UNC); - return ERR_PTR(-EINVAL); + /* trim off any trailing delimiters */ + while (*pos == '\\') { + --pos; + --unclen; } - /* find sharename end */ - pSep++; - pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); - if (pSep) { - /* trim path up to sharename end - * now we have share name in UNC */ - *pSep = 0; + /* allocate a buffer: + * +2 for preceding "//" + * +1 for delimiter between UNC and prepath + * +1 for trailing NULL + */ + pplen = prepath ? strlen(prepath) : 0; + dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + pos = dev; + /* add the initial "//" */ + *pos = '/'; + ++pos; + *pos = '/'; + ++pos; + + /* copy in the UNC portion from referral */ + memcpy(pos, nodename, unclen); + pos += unclen; + + /* copy the prefixpath remainder (if there is one) */ + if (pplen) { + *pos = '/'; + ++pos; + memcpy(pos, prepath, pplen); + pos += pplen; } - return UNC; + /* NULL terminator */ + *pos = '\0'; + + convert_delimiter(dev, '/'); + return dev; } @@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, { int rc; char *mountdata = NULL; + const char *prepath = NULL; int md_len; char *tkn_e; char *srvIP = NULL; @@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - *devname = cifs_get_share_name(ref->node_name); + if (strlen(fullpath) - ref->path_consumed) + prepath = fullpath + ref->path_consumed; + + *devname = cifs_build_devname(ref->node_name, prepath); if (IS_ERR(*devname)) { rc = PTR_ERR(*devname); *devname = NULL; @@ -146,12 +167,14 @@ char *cifs_compose_mount_options(const char *sb_mountdata, goto compose_mount_options_err; } - /* md_len = strlen(...) + 12 for 'sep+prefixpath=' - * assuming that we have 'unc=' and 'ip=' in - * the original sb_mountdata + /* + * In most cases, we'll be building a shorter string than the original, + * but we do have to assume that the address in the ip= option may be + * much longer than the original. Add the max length of an address + * string to the length of the original string to allow for worst case. */ - md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; - mountdata = kzalloc(md_len+1, GFP_KERNEL); + md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; + mountdata = kzalloc(md_len + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; @@ -195,26 +218,6 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strncat(mountdata, &sep, 1); strcat(mountdata, "ip="); strcat(mountdata, srvIP); - strncat(mountdata, &sep, 1); - strcat(mountdata, "unc="); - strcat(mountdata, *devname); - - /* find & copy prefixpath */ - tkn_e = strchr(ref->node_name + 2, '\\'); - if (tkn_e == NULL) { - /* invalid unc, missing share name*/ - rc = -EINVAL; - goto compose_mount_options_err; - } - - tkn_e = strchr(tkn_e + 1, '\\'); - if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { - strncat(mountdata, &sep, 1); - strcat(mountdata, "prefixpath="); - if (tkn_e) - strcat(mountdata, tkn_e + 1); - strcat(mountdata, fullpath + ref->path_consumed); - } /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 4fb0974..fe8d627 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -327,14 +327,14 @@ UniToupper(register wchar_t uc) /* * UniStrupr: Upper case a unicode string */ -static inline wchar_t * -UniStrupr(register wchar_t *upin) +static inline __le16 * +UniStrupr(register __le16 *upin) { - register wchar_t *up; + register __le16 *up; up = upin; while (*up) { /* For all characters */ - *up = UniToupper(*up); + *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); up++; } return upin; /* Return input pointer */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 71436d1..3d8bf94 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -276,7 +276,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { - memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); return 0; @@ -414,7 +413,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, int rc = 0; int len; char nt_hash[CIFS_NTHASH_SIZE]; - wchar_t *user; + __le16 *user; wchar_t *domain; wchar_t *server; @@ -439,7 +438,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, return rc; } - /* convert ses->user_name to unicode and uppercase */ + /* convert ses->user_name to unicode */ len = ses->user_name ? strlen(ses->user_name) : 0; user = kmalloc(2 + (len * 2), GFP_KERNEL); if (user == NULL) { @@ -448,7 +447,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } if (len) { - len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); + len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp); UniStrupr(user); } else { memset(user, '\0', 2); @@ -536,7 +535,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return rc; } - if (ses->server->secType == RawNTLMSSP) + if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) memcpy(ses->auth_key.response + offset, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); else @@ -568,7 +567,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) char ntlmv2_hash[16]; unsigned char *tiblob = NULL; /* target info blob */ - if (ses->server->secType == RawNTLMSSP) { + if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { rc = find_domain_name(ses, nls_cp); if (rc) { @@ -706,6 +705,9 @@ calc_seckey(struct cifs_ses *ses) void cifs_crypto_shash_release(struct TCP_Server_Info *server) { + if (server->secmech.cmacaes) + crypto_free_shash(server->secmech.cmacaes); + if (server->secmech.hmacsha256) crypto_free_shash(server->secmech.hmacsha256); @@ -715,6 +717,8 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server) if (server->secmech.hmacmd5) crypto_free_shash(server->secmech.hmacmd5); + kfree(server->secmech.sdesccmacaes); + kfree(server->secmech.sdeschmacsha256); kfree(server->secmech.sdeschmacmd5); @@ -748,6 +752,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) goto crypto_allocate_hmacsha256_fail; } + server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0); + if (IS_ERR(server->secmech.cmacaes)) { + cifs_dbg(VFS, "could not allocate crypto cmac-aes"); + rc = PTR_ERR(server->secmech.cmacaes); + goto crypto_allocate_cmacaes_fail; + } + size = sizeof(struct shash_desc) + crypto_shash_descsize(server->secmech.hmacmd5); server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); @@ -778,8 +789,22 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server) server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256; server->secmech.sdeschmacsha256->shash.flags = 0x0; + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.cmacaes); + server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdesccmacaes) { + cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__); + rc = -ENOMEM; + goto crypto_allocate_cmacaes_sdesc_fail; + } + server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes; + server->secmech.sdesccmacaes->shash.flags = 0x0; + return 0; +crypto_allocate_cmacaes_sdesc_fail: + kfree(server->secmech.sdeschmacsha256); + crypto_allocate_hmacsha256_sdesc_fail: kfree(server->secmech.sdescmd5); @@ -787,6 +812,9 @@ crypto_allocate_md5_sdesc_fail: kfree(server->secmech.sdeschmacmd5); crypto_allocate_hmacmd5_sdesc_fail: + crypto_free_shash(server->secmech.cmacaes); + +crypto_allocate_cmacaes_fail: crypto_free_shash(server->secmech.hmacsha256); crypto_allocate_hmacsha256_fail: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 72e4efe..4bdd547 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -312,11 +312,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) } static void -cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +cifs_show_security(struct seq_file *s, struct cifs_ses *ses) { + if (ses->sectype == Unspecified) + return; + seq_printf(s, ",sec="); - switch (server->secType) { + switch (ses->sectype) { case LANMAN: seq_printf(s, "lanman"); break; @@ -338,7 +341,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) break; } - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (ses->sign) seq_printf(s, "i"); } @@ -369,12 +372,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root) srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); - cifs_show_security(s, tcon->ses->server); + cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); - seq_printf(s, ",unc="); - seq_escape(s, tcon->treeName, " \t\n\\"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); else if (tcon->ses->user_name) @@ -768,7 +768,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) { - /* note that this is called by vfs setlease with lock_flocks held + /* note that this is called by vfs setlease with i_lock held to protect *lease from going away */ struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; @@ -971,7 +971,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { }; const struct file_operations cifs_dir_ops = { - .readdir = cifs_readdir, + .iterate = cifs_readdir, .release = cifs_closedir, .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0e32c34..ea723a5 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); -extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); +extern int cifs_readdir(struct file *file, struct dir_context *ctx); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; @@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.0" +#define CIFS_VERSION "2.01" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 4f07f6f..e66b088 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -101,20 +101,14 @@ enum statusEnum { }; enum securityEnum { - LANMAN = 0, /* Legacy LANMAN auth */ + Unspecified = 0, /* not specified */ + LANMAN, /* Legacy LANMAN auth */ NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ -/* NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */ Kerberos, /* Kerberos via SPNEGO */ }; -enum protocolEnum { - TCP = 0, - SCTP - /* Netbios frames protocol not supported at this time */ -}; - struct session_key { unsigned int len; char *response; @@ -131,9 +125,11 @@ struct cifs_secmech { struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ struct crypto_shash *md5; /* md5 hash function */ struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */ + struct crypto_shash *cmacaes; /* block-cipher based MAC function */ struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */ + struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */ }; /* per smb session structure/fields */ @@ -181,6 +177,7 @@ enum smb_version { Smb_20, Smb_21, Smb_30, + Smb_302, }; struct mid_q_entry; @@ -228,6 +225,7 @@ struct smb_version_operations { void (*dump_detail)(void *); void (*clear_stats)(struct cifs_tcon *); void (*print_stats)(struct seq_file *m, struct cifs_tcon *); + void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); @@ -367,6 +365,8 @@ struct smb_version_operations { void (*set_lease_key)(struct inode *, struct cifs_fid *fid); /* generate new lease key */ void (*new_lease_key)(struct cifs_fid *fid); + /* The next two functions will need to be changed to per smb session */ + void (*generate_signingkey)(struct TCP_Server_Info *server); int (*calc_signature)(struct smb_rqst *rqst, struct TCP_Server_Info *server); }; @@ -387,6 +387,8 @@ struct smb_version_values { unsigned int cap_nt_find; unsigned int cap_large_files; unsigned int oplock_read; + __u16 signing_enabled; + __u16 signing_required; }; #define HEADER_SIZE(server) (server->vals->header_size) @@ -407,7 +409,8 @@ struct smb_vol { kgid_t backupgid; umode_t file_mode; umode_t dir_mode; - unsigned secFlg; + enum securityEnum sectype; /* sectype requested via mnt opts */ + bool sign; /* was signing requested via mnt opts? */ bool retry:1; bool intr:1; bool setuids:1; @@ -441,6 +444,7 @@ struct smb_vol { bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; bool rwpidforward:1; /* pid forward for read/write operations */ + bool nosharesock; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -514,6 +518,7 @@ struct TCP_Server_Info { struct task_struct *tsk; char server_GUID[16]; __u16 sec_mode; + bool sign; /* is signing enabled on this connection? */ bool session_estab; /* mark when very first sess is established */ #ifdef CONFIG_CIFS_SMB2 int echo_credits; /* echo reserved slots */ @@ -521,7 +526,6 @@ struct TCP_Server_Info { bool echoes:1; /* enable echoes */ #endif u16 dialect; /* dialect index that server chose */ - enum securityEnum secType; bool oplocks:1; /* enable oplocks */ unsigned int maxReq; /* Clients should submit no more */ /* than maxReq distinct unanswered SMBs to the server when using */ @@ -540,12 +544,17 @@ struct TCP_Server_Info { int timeAdj; /* Adjust for difference in server time zone in sec */ __u64 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ + char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ +#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */ +#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */ +#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */ + char negflavor; /* NEGOTIATE response flavor */ /* extended security flavors that server supports */ bool sec_ntlmssp; /* supports NTLMSSP */ bool sec_kerberosu2u; /* supports U2U Kerberos */ @@ -697,7 +706,6 @@ struct cifs_ses { enum statusEnum status; unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ - __u16 flags; __u16 vcnum; char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -714,21 +722,14 @@ struct cifs_ses { char *password; struct session_key auth_key; struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ + enum securityEnum sectype; /* what security flavor was specified? */ + bool sign; /* is signing required? */ bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; #endif /* CONFIG_CIFS_SMB2 */ }; -/* no more than one of the following three session flags may be set */ -#define CIFS_SES_NT4 1 -#define CIFS_SES_OS2 2 -#define CIFS_SES_W9X 4 -/* following flag is set for old servers such as OS2 (and Win95?) - which do not negotiate NTLM or POSIX dialects, but instead - negotiate one of the older LANMAN dialects */ -#define CIFS_SES_LANMAN 8 - static inline bool cap_unix(struct cifs_ses *ses) { @@ -816,7 +817,7 @@ struct cifs_tcon { #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ - __u32 capabilities; + __le32 capabilities; __u32 share_flags; __u32 maximal_access; __u32 vol_serial_number; @@ -1348,7 +1349,7 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ -#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP) +#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* @@ -1494,4 +1495,7 @@ extern struct smb_version_values smb21_values; #define SMB30_VERSION_STRING "3.0" extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; +#define SMB302_VERSION_STRING "3.02" +/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ +extern struct smb_version_values smb302_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index e996ff6..11ca24a 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -142,6 +142,11 @@ */ #define CIFS_SESS_KEY_SIZE (16) +/* + * Size of the smb3 signing key + */ +#define SMB3_SIGN_KEY_SIZE (16) + #define CIFS_CLIENT_CHALLENGE_SIZE (8) #define CIFS_SERVER_CHALLENGE_SIZE (8) #define CIFS_HMAC_MD5_HASH_SIZE (16) @@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp { #define READ_RAW_ENABLE 1 #define WRITE_RAW_ENABLE 2 #define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) - +#define SMB1_CLIENT_GUID_SIZE (16) typedef struct negotiate_rsp { struct smb_hdr hdr; /* wct = 17 */ __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ @@ -553,7 +558,7 @@ typedef struct negotiate_rsp { /* followed by 16 bytes of server GUID */ /* then security blob if cap_extended_security negotiated */ struct { - unsigned char GUID[16]; + unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; unsigned char SecurityBlob[1]; } __attribute__((packed)) extended_response; } __attribute__((packed)) u; @@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp { /* parms and data follow */ } __attribute__((packed)) NTRANSACT_RSP; +/* See MS-SMB 2.2.7.2.1.1 */ +struct srv_copychunk { + __le64 SourceOffset; + __le64 DestinationOffset; + __le32 CopyLength; + __u32 Reserved; +} __packed; + typedef struct smb_com_transaction_ioctl_req { struct smb_hdr hdr; /* wct = 23 */ __u8 MaxSetupCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index dda188a..c8ff018 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ , extern int small_smb_init_no_tc(const int smb_cmd, const int wct, struct cifs_ses *ses, void **request_buf); +extern enum securityEnum select_sectype(struct TCP_Server_Info *server, + enum securityEnum requested); extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); @@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses); extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info); +extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required); extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses); extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, @@ -433,6 +436,7 @@ extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); +extern void generate_smb3signingkey(struct TCP_Server_Info *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a58dc77..a89c4cb 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -367,6 +367,185 @@ vt2_err: return -EINVAL; } +static int +decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr) +{ + int rc = 0; + u16 count; + char *guid = pSMBr->u.extended_response.GUID; + struct TCP_Server_Info *server = ses->server; + + count = get_bcc(&pSMBr->hdr); + if (count < SMB1_CLIENT_GUID_SIZE) + return -EIO; + + spin_lock(&cifs_tcp_ses_lock); + if (server->srv_count > 1) { + spin_unlock(&cifs_tcp_ses_lock); + if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) { + cifs_dbg(FYI, "server UID changed\n"); + memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE); + } + } else { + spin_unlock(&cifs_tcp_ses_lock); + memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE); + } + + if (count == SMB1_CLIENT_GUID_SIZE) { + server->sec_ntlmssp = true; + } else { + count -= SMB1_CLIENT_GUID_SIZE; + rc = decode_negTokenInit( + pSMBr->u.extended_response.SecurityBlob, count, server); + if (rc != 1) + return -EINVAL; + } + + return 0; +} + +int +cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) +{ + bool srv_sign_required = server->sec_mode & server->vals->signing_required; + bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled; + bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN; + + /* + * Is signing required by mnt options? If not then check + * global_secflags to see if it is there. + */ + if (!mnt_sign_required) + mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) == + CIFSSEC_MUST_SIGN); + + /* + * If signing is required then it's automatically enabled too, + * otherwise, check to see if the secflags allow it. + */ + mnt_sign_enabled = mnt_sign_required ? mnt_sign_required : + (global_secflags & CIFSSEC_MAY_SIGN); + + /* If server requires signing, does client allow it? */ + if (srv_sign_required) { + if (!mnt_sign_enabled) { + cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!"); + return -ENOTSUPP; + } + server->sign = true; + } + + /* If client requires signing, does server allow it? */ + if (mnt_sign_required) { + if (!srv_sign_enabled) { + cifs_dbg(VFS, "Server does not support signing!"); + return -ENOTSUPP; + } + server->sign = true; + } + + return 0; +} + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +static int +decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) +{ + __s16 tmp; + struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; + + if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT) + return -EOPNOTSUPP; + + server->sec_mode = le16_to_cpu(rsp->SecurityMode); + server->maxReq = min_t(unsigned int, + le16_to_cpu(rsp->MaxMpxCount), + cifs_max_pending); + set_credits(server, server->maxReq); + server->maxBuf = le16_to_cpu(rsp->MaxBufSize); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); + /* even though we do not use raw we might as well set this + accurately, in case we ever find a need for it */ + if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { + server->max_rw = 0xFF00; + server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; + } else { + server->max_rw = 0;/* do not need to use raw anyway */ + server->capabilities = CAP_MPX_MODE; + } + tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); + if (tmp == -1) { + /* OS/2 often does not set timezone therefore + * we must use server time to calc time zone. + * Could deviate slightly from the right zone. + * Smallest defined timezone difference is 15 minutes + * (i.e. Nepal). Rounding up/down is done to match + * this requirement. + */ + int val, seconds, remain, result; + struct timespec ts, utc; + utc = CURRENT_TIME; + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); + cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", + (int)ts.tv_sec, (int)utc.tv_sec, + (int)(utc.tv_sec - ts.tv_sec)); + val = (int)(utc.tv_sec - ts.tv_sec); + seconds = abs(val); + result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; + remain = seconds % MIN_TZ_ADJ; + if (remain >= (MIN_TZ_ADJ / 2)) + result += MIN_TZ_ADJ; + if (val < 0) + result = -result; + server->timeAdj = result; + } else { + server->timeAdj = (int)tmp; + server->timeAdj *= 60; /* also in seconds */ + } + cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj); + + + /* BB get server time for time conversions and add + code to use it and timezone since this is not UTC */ + + if (rsp->EncryptionKeyLength == + cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { + memcpy(server->cryptkey, rsp->EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { + return -EIO; /* need cryptkey unless plain text */ + } + + cifs_dbg(FYI, "LANMAN negotiated\n"); + return 0; +} +#else +static inline int +decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) +{ + cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); + return -EOPNOTSUPP; +} +#endif + +static bool +should_set_ext_sec_flag(enum securityEnum sectype) +{ + switch (sectype) { + case RawNTLMSSP: + case Kerberos: + return true; + case Unspecified: + if (global_secflags & + (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)) + return true; + /* Fallthrough */ + default: + return false; + } +} + int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) { @@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) int rc = 0; int bytes_returned; int i; - struct TCP_Server_Info *server; + struct TCP_Server_Info *server = ses->server; u16 count; - unsigned int secFlags; - if (ses->server) - server = ses->server; - else { - rc = -EIO; - return rc; + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; } + rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ , (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; - /* if any of auth flags (ie not sign or seal) are overriden use them */ - if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - secFlags = ses->overrideSecFlg; /* BB FIXME fix sign flags? */ - else /* if override flags set only sign/seal OR them with global auth */ - secFlags = global_secflags | ses->overrideSecFlg; - - cifs_dbg(FYI, "secFlags 0x%x\n", secFlags); - pSMB->hdr.Mid = get_next_mid(server); pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); - if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) - pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) { - cifs_dbg(FYI, "Kerberos only mechanism, enable extended security\n"); - pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - } else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP) - pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) { - cifs_dbg(FYI, "NTLMSSP only mechanism, enable extended security\n"); + if (should_set_ext_sec_flag(ses->sectype)) { + cifs_dbg(FYI, "Requesting extended security."); pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; } @@ -436,127 +598,21 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) could not negotiate a common dialect */ rc = -EOPNOTSUPP; goto neg_err_exit; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - } else if ((pSMBr->hdr.WordCount == 13) - && ((server->dialect == LANMAN_PROT) - || (server->dialect == LANMAN2_PROT))) { - __s16 tmp; - struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; - - if ((secFlags & CIFSSEC_MAY_LANMAN) || - (secFlags & CIFSSEC_MAY_PLNTXT)) - server->secType = LANMAN; - else { - cifs_dbg(VFS, "mount failed weak security disabled in /proc/fs/cifs/SecurityFlags\n"); - rc = -EOPNOTSUPP; - goto neg_err_exit; - } - server->sec_mode = le16_to_cpu(rsp->SecurityMode); - server->maxReq = min_t(unsigned int, - le16_to_cpu(rsp->MaxMpxCount), - cifs_max_pending); - set_credits(server, server->maxReq); - server->maxBuf = le16_to_cpu(rsp->MaxBufSize); - server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); - /* even though we do not use raw we might as well set this - accurately, in case we ever find a need for it */ - if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { - server->max_rw = 0xFF00; - server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; - } else { - server->max_rw = 0;/* do not need to use raw anyway */ - server->capabilities = CAP_MPX_MODE; - } - tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); - if (tmp == -1) { - /* OS/2 often does not set timezone therefore - * we must use server time to calc time zone. - * Could deviate slightly from the right zone. - * Smallest defined timezone difference is 15 minutes - * (i.e. Nepal). Rounding up/down is done to match - * this requirement. - */ - int val, seconds, remain, result; - struct timespec ts, utc; - utc = CURRENT_TIME; - ts = cnvrtDosUnixTm(rsp->SrvTime.Date, - rsp->SrvTime.Time, 0); - cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", - (int)ts.tv_sec, (int)utc.tv_sec, - (int)(utc.tv_sec - ts.tv_sec)); - val = (int)(utc.tv_sec - ts.tv_sec); - seconds = abs(val); - result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; - remain = seconds % MIN_TZ_ADJ; - if (remain >= (MIN_TZ_ADJ / 2)) - result += MIN_TZ_ADJ; - if (val < 0) - result = -result; - server->timeAdj = result; - } else { - server->timeAdj = (int)tmp; - server->timeAdj *= 60; /* also in seconds */ - } - cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj); - - - /* BB get server time for time conversions and add - code to use it and timezone since this is not UTC */ - - if (rsp->EncryptionKeyLength == - cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { - memcpy(ses->server->cryptkey, rsp->EncryptionKey, - CIFS_CRYPTO_KEY_SIZE); - } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { - rc = -EIO; /* need cryptkey unless plain text */ - goto neg_err_exit; - } - - cifs_dbg(FYI, "LANMAN negotiated\n"); - /* we will not end up setting signing flags - as no signing - was in LANMAN and server did not return the flags on */ - goto signing_check; -#else /* weak security disabled */ } else if (pSMBr->hdr.WordCount == 13) { - cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); - rc = -EOPNOTSUPP; -#endif /* WEAK_PW_HASH */ - goto neg_err_exit; + server->negflavor = CIFS_NEGFLAVOR_LANMAN; + rc = decode_lanman_negprot_rsp(server, pSMBr); + goto signing_check; } else if (pSMBr->hdr.WordCount != 17) { /* unknown wct */ rc = -EOPNOTSUPP; goto neg_err_exit; } - /* else wct == 17 NTLM */ + /* else wct == 17, NTLM or better */ + server->sec_mode = pSMBr->SecurityMode; if ((server->sec_mode & SECMODE_USER) == 0) cifs_dbg(FYI, "share mode security\n"); - if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0) -#ifdef CONFIG_CIFS_WEAK_PW_HASH - if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) -#endif /* CIFS_WEAK_PW_HASH */ - cifs_dbg(VFS, "Server requests plain text password but client support disabled\n"); - - if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) - server->secType = NTLMv2; - else if (secFlags & CIFSSEC_MAY_NTLM) - server->secType = NTLM; - else if (secFlags & CIFSSEC_MAY_NTLMV2) - server->secType = NTLMv2; - else if (secFlags & CIFSSEC_MAY_KRB5) - server->secType = Kerberos; - else if (secFlags & CIFSSEC_MAY_NTLMSSP) - server->secType = RawNTLMSSP; - else if (secFlags & CIFSSEC_MAY_LANMAN) - server->secType = LANMAN; - else { - rc = -EOPNOTSUPP; - cifs_dbg(VFS, "Invalid security type\n"); - goto neg_err_exit; - } - /* else ... any others ...? */ - /* one byte, so no need to convert this or EncryptionKeyLen from little endian */ server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount), @@ -569,90 +625,26 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->capabilities = le32_to_cpu(pSMBr->Capabilities); server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; + if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + server->negflavor = CIFS_NEGFLAVOR_UNENCAP; memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || server->capabilities & CAP_EXTENDED_SECURITY) && (pSMBr->EncryptionKeyLength == 0)) { - /* decode security blob */ - count = get_bcc(&pSMBr->hdr); - if (count < 16) { - rc = -EIO; - goto neg_err_exit; - } - spin_lock(&cifs_tcp_ses_lock); - if (server->srv_count > 1) { - spin_unlock(&cifs_tcp_ses_lock); - if (memcmp(server->server_GUID, - pSMBr->u.extended_response. - GUID, 16) != 0) { - cifs_dbg(FYI, "server UID changed\n"); - memcpy(server->server_GUID, - pSMBr->u.extended_response.GUID, - 16); - } - } else { - spin_unlock(&cifs_tcp_ses_lock); - memcpy(server->server_GUID, - pSMBr->u.extended_response.GUID, 16); - } - - if (count == 16) { - server->secType = RawNTLMSSP; - } else { - rc = decode_negTokenInit(pSMBr->u.extended_response. - SecurityBlob, count - 16, - server); - if (rc == 1) - rc = 0; - else - rc = -EINVAL; - if (server->secType == Kerberos) { - if (!server->sec_kerberos && - !server->sec_mskerberos) - rc = -EOPNOTSUPP; - } else if (server->secType == RawNTLMSSP) { - if (!server->sec_ntlmssp) - rc = -EOPNOTSUPP; - } else - rc = -EOPNOTSUPP; - } + server->negflavor = CIFS_NEGFLAVOR_EXTENDED; + rc = decode_ext_sec_blob(ses, pSMBr); } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { rc = -EIO; /* no crypt key only if plain text pwd */ - goto neg_err_exit; - } else - server->capabilities &= ~CAP_EXTENDED_SECURITY; - -#ifdef CONFIG_CIFS_WEAK_PW_HASH -signing_check: -#endif - if ((secFlags & CIFSSEC_MAY_SIGN) == 0) { - /* MUST_SIGN already includes the MAY_SIGN FLAG - so if this is zero it means that signing is disabled */ - cifs_dbg(FYI, "Signing disabled\n"); - if (server->sec_mode & SECMODE_SIGN_REQUIRED) { - cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n"); - rc = -EOPNOTSUPP; - } - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { - /* signing required */ - cifs_dbg(FYI, "Must sign - secFlags 0x%x\n", secFlags); - if ((server->sec_mode & - (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { - cifs_dbg(VFS, "signing required but server lacks support\n"); - rc = -EOPNOTSUPP; - } else - server->sec_mode |= SECMODE_SIGN_REQUIRED; } else { - /* signing optional ie CIFSSEC_MAY_SIGN */ - if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0) - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + server->negflavor = CIFS_NEGFLAVOR_UNENCAP; + server->capabilities &= ~CAP_EXTENDED_SECURITY; } +signing_check: + if (!rc) + rc = cifs_enable_signing(server, ses->sign); neg_err_exit: cifs_buf_release(pSMB); @@ -777,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses) pSMB->hdr.Mid = get_next_mid(ses->server); - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + if (ses->server->sign) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; @@ -1540,8 +1531,7 @@ cifs_readv_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: /* result already set, check signature */ - if (server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (server->sign) { int rc = 0; rc = cifs_verify_signature(&rqst, server, @@ -3940,6 +3930,7 @@ QFileInfoRetry: pSMB->Pad = 0; pSMB->Fid = netfid; inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -4108,6 +4099,7 @@ UnixQFileInfoRetry: pSMB->Pad = 0; pSMB->Fid = netfid; inc_rfc1001_len(pSMB, byte_count); + pSMB->t2.ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -4794,11 +4786,8 @@ getDFSRetry: strncpy(pSMB->RequestFileName, search_name, name_len); } - if (ses->server) { - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - } + if (ses->server && ses->server->sign) + pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 99eeaa1..afcb8a1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -85,7 +85,7 @@ enum { Opt_acl, Opt_noacl, Opt_locallease, Opt_sign, Opt_seal, Opt_noac, Opt_fsc, Opt_mfsymlinks, - Opt_multiuser, Opt_sloppy, + Opt_multiuser, Opt_sloppy, Opt_nosharesock, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -165,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_mfsymlinks, "mfsymlinks" }, { Opt_multiuser, "multiuser" }, { Opt_sloppy, "sloppy" }, + { Opt_nosharesock, "nosharesock" }, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -275,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_20, SMB20_VERSION_STRING}, { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, + { Smb_302, SMB302_VERSION_STRING }, }; static int ip_connect(struct TCP_Server_Info *server); @@ -1024,39 +1026,44 @@ static int cifs_parse_security_flavors(char *value, substring_t args[MAX_OPT_ARGS]; + /* + * With mount options, the last one should win. Reset any existing + * settings back to default. + */ + vol->sectype = Unspecified; + vol->sign = false; + switch (match_token(value, cifs_secflavor_tokens, args)) { - case Opt_sec_krb5: - vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN; - break; - case Opt_sec_krb5i: - vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN; - break; case Opt_sec_krb5p: - /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */ - cifs_dbg(VFS, "Krb5 cifs privacy not supported\n"); - break; - case Opt_sec_ntlmssp: - vol->secFlg |= CIFSSEC_MAY_NTLMSSP; + cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + return 1; + case Opt_sec_krb5i: + vol->sign = true; + /* Fallthrough */ + case Opt_sec_krb5: + vol->sectype = Kerberos; break; case Opt_sec_ntlmsspi: - vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN; - break; - case Opt_ntlm: - /* ntlm is default so can be turned off too */ - vol->secFlg |= CIFSSEC_MAY_NTLM; + vol->sign = true; + /* Fallthrough */ + case Opt_sec_ntlmssp: + vol->sectype = RawNTLMSSP; break; case Opt_sec_ntlmi: - vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN; - break; - case Opt_sec_ntlmv2: - vol->secFlg |= CIFSSEC_MAY_NTLMV2; + vol->sign = true; + /* Fallthrough */ + case Opt_ntlm: + vol->sectype = NTLM; break; case Opt_sec_ntlmv2i: - vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN; + vol->sign = true; + /* Fallthrough */ + case Opt_sec_ntlmv2: + vol->sectype = NTLMv2; break; #ifdef CONFIG_CIFS_WEAK_PW_HASH case Opt_sec_lanman: - vol->secFlg |= CIFSSEC_MAY_LANMAN; + vol->sectype = LANMAN; break; #endif case Opt_sec_none: @@ -1118,6 +1125,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb30_operations; vol->vals = &smb30_values; break; + case Smb_302: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb302_values; + break; #endif default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); @@ -1257,14 +1268,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->backupuid_specified = false; /* no backup intent for a user */ vol->backupgid_specified = false; /* no backup intent for a group */ - /* - * For now, we ignore -EINVAL errors under the assumption that the - * unc= and prefixpath= options will be usable. - */ - if (cifs_parse_devname(devname, vol) == -ENOMEM) { - printk(KERN_ERR "CIFS: Unable to allocate memory to parse " - "device string.\n"); - goto out_nomem; + switch (cifs_parse_devname(devname, vol)) { + case 0: + break; + case -ENOMEM: + cifs_dbg(VFS, "Unable to allocate memory for devname.\n"); + goto cifs_parse_mount_err; + case -EINVAL: + cifs_dbg(VFS, "Malformed UNC in devname.\n"); + goto cifs_parse_mount_err; + default: + cifs_dbg(VFS, "Unknown error parsing devname.\n"); + goto cifs_parse_mount_err; } while ((data = strsep(&options, separator)) != NULL) { @@ -1419,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->local_lease = 1; break; case Opt_sign: - vol->secFlg |= CIFSSEC_MUST_SIGN; + vol->sign = true; break; case Opt_seal: /* we do not do the following in secFlags because seal @@ -1450,6 +1465,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_sloppy: sloppy = true; break; + case Opt_nosharesock: + vol->nosharesock = true; + break; /* Numeric Values */ case Opt_backupuid: @@ -1826,7 +1844,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } #endif if (!vol->UNC) { - cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string or in unc= option!\n"); + cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); goto cifs_parse_mount_err; } @@ -1973,47 +1991,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr, static bool match_security(struct TCP_Server_Info *server, struct smb_vol *vol) { - unsigned int secFlags; - - if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - secFlags = vol->secFlg; - else - secFlags = global_secflags | vol->secFlg; - - switch (server->secType) { - case LANMAN: - if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT))) - return false; - break; - case NTLMv2: - if (!(secFlags & CIFSSEC_MAY_NTLMV2)) - return false; - break; - case NTLM: - if (!(secFlags & CIFSSEC_MAY_NTLM)) - return false; - break; - case Kerberos: - if (!(secFlags & CIFSSEC_MAY_KRB5)) - return false; - break; - case RawNTLMSSP: - if (!(secFlags & CIFSSEC_MAY_NTLMSSP)) - return false; - break; - default: - /* shouldn't happen */ + /* + * The select_sectype function should either return the vol->sectype + * that was specified, or "Unspecified" if that sectype was not + * compatible with the given NEGOTIATE request. + */ + if (select_sectype(server, vol->sectype) == Unspecified) return false; - } - /* now check if signing mode is acceptable */ - if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && - (server->sec_mode & SECMODE_SIGN_REQUIRED)) - return false; - else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) && - (server->sec_mode & - (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0) - return false; + /* + * Now check if signing mode is acceptable. No need to check + * global_secflags at this point since if MUST_SIGN is set then + * the server->sign had better be too. + */ + if (vol->sign && !server->sign) + return false; return true; } @@ -2022,6 +2014,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) { struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; + if (vol->nosharesock) + return 0; + if ((server->vals != vol->vals) || (server->ops != vol->ops)) return 0; @@ -2211,7 +2206,11 @@ out_err: static int match_session(struct cifs_ses *ses, struct smb_vol *vol) { - switch (ses->server->secType) { + if (vol->sectype != Unspecified && + vol->sectype != ses->sectype) + return 0; + + switch (ses->sectype) { case Kerberos: if (!uid_eq(vol->cred_uid, ses->cred_uid)) return 0; @@ -2488,7 +2487,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; - ses->overrideSecFlg = volume_info->secFlg; + ses->sectype = volume_info->sectype; + ses->sign = volume_info->sign; mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); @@ -3274,8 +3274,8 @@ build_unc_path_to_root(const struct smb_vol *vol, pos = full_path + unc_len; if (pplen) { - *pos++ = CIFS_DIR_SEP(cifs_sb); - strncpy(pos, vol->prepath, pplen); + *pos = CIFS_DIR_SEP(cifs_sb); + strncpy(pos + 1, vol->prepath, pplen); pos += pplen; } @@ -3651,7 +3651,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, NTLMv2 password here) */ #ifdef CONFIG_CIFS_WEAK_PW_HASH if ((global_secflags & CIFSSEC_MAY_LANMAN) && - (ses->server->secType == LANMAN)) + (ses->sectype == LANMAN)) calc_lanman_hash(tcon->password, ses->server->cryptkey, ses->server->sec_mode & SECMODE_PW_ENCRYPT ? true : false, @@ -3669,8 +3669,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } } - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (ses->server->sign) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_STATUS32) { @@ -3733,7 +3732,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, } bcc_ptr += length + 1; bytes_left -= (length + 1); - strncpy(tcon->treeName, tree, MAX_TREE_SIZE); + strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); /* mostly informational -- no need to fail on error here */ kfree(tcon->nativeFileSystem); @@ -3822,7 +3821,6 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, int rc = -ENOSYS; struct TCP_Server_Info *server = ses->server; - ses->flags = 0; ses->capabilities = server->capabilities; if (linuxExtEnabled == 0) ses->capabilities &= (~server->vals->cap_unix); @@ -3843,6 +3841,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, server->sequence_number = 0x2; server->session_estab = true; ses->auth_key.response = NULL; + if (server->ops->generate_signingkey) + server->ops->generate_signingkey(server); } mutex_unlock(&server->srv_mutex); @@ -3865,23 +3865,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, static int cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) { - switch (ses->server->secType) { - case Kerberos: - vol->secFlg = CIFSSEC_MUST_KRB5; + vol->sectype = ses->sectype; + + /* krb5 is special, since we don't need username or pw */ + if (vol->sectype == Kerberos) return 0; - case NTLMv2: - vol->secFlg = CIFSSEC_MUST_NTLMV2; - break; - case NTLM: - vol->secFlg = CIFSSEC_MUST_NTLM; - break; - case RawNTLMSSP: - vol->secFlg = CIFSSEC_MUST_NTLMSSP; - break; - case LANMAN: - vol->secFlg = CIFSSEC_MUST_LANMAN; - break; - } return cifs_set_cifscreds(vol, ses); } @@ -3907,6 +3895,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) vol_info->nocase = master_tcon->nocase; vol_info->local_lease = master_tcon->local_lease; vol_info->no_linux_ext = !master_tcon->unix_ext; + vol_info->sectype = master_tcon->ses->sectype; + vol_info->sign = master_tcon->ses->sign; rc = cifs_set_vol_auth(vol_info, master_tcon->ses); if (rc) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 5699b50..5175aeb 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -822,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = { /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; -static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *q) +static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) { struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; @@ -838,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode, return 0; } -static int cifs_ci_compare(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls; + struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls; if ((name->len == len) && (nls_strnicmp(codepage, name->name, str, len) == 0)) diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index e7512e4..7ede730 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -34,7 +34,7 @@ /** * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. - * @unc: UNC path specifying the server + * @unc: UNC path specifying the server (with '/' as delimiter) * @ip_addr: Where to return the IP address. * * The IP address will be returned in string form, and the caller is @@ -64,7 +64,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) hostname = unc + 2; /* Search for server name delimiter */ - sep = memchr(hostname, '\\', len); + sep = memchr(hostname, '/', len); if (sep) len = sep - hostname; else diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 48b29d2..91d8629 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -999,7 +999,7 @@ try_again: rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); if (!rc) goto try_again; - locks_delete_block(flock); + posix_unblock_lock(flock); } return rc; } @@ -1092,6 +1092,7 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { + struct inode *inode = cfile->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock, **before; unsigned int count = 0, i = 0; @@ -1102,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - lock_flocks(); - cifs_for_each_lock(cfile->dentry->d_inode, before) { + spin_lock(&inode->i_lock); + cifs_for_each_lock(inode, before) { if ((*before)->fl_flags & FL_POSIX) count++; } - unlock_flocks(); + spin_unlock(&inode->i_lock); INIT_LIST_HEAD(&locks_to_send); @@ -1126,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } el = locks_to_send.next; - lock_flocks(); - cifs_for_each_lock(cfile->dentry->d_inode, before) { + spin_lock(&inode->i_lock); + cifs_for_each_lock(inode, before) { flock = *before; if ((flock->fl_flags & FL_POSIX) == 0) continue; @@ -1152,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->offset = flock->fl_start; el = el->next; } - unlock_flocks(); + spin_unlock(&inode->i_lock); list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { int stored_rc; @@ -3546,11 +3547,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp) return cifs_fscache_release_page(page, gfp); } -static void cifs_invalidate_page(struct page *page, unsigned long offset) +static void cifs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fc30251..20efd81 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) inode->i_flags |= S_AUTOMOUNT; - cifs_set_ops(inode); + if (inode->i_state & I_NEW) + cifs_set_ops(inode); } void diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1bec014..f7d4b22 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , if (treeCon->nocase) buffer->Flags |= SMBFLG_CASELESS; if ((treeCon->ses) && (treeCon->ses->server)) - if (treeCon->ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (treeCon->ses->server->sign) buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 770d5a9..ab87784 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -126,6 +126,22 @@ out: dput(dentry); } +/* + * Is it possible that this directory might turn out to be a DFS referral + * once we go to try and use it? + */ +static bool +cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb) +{ +#ifdef CONFIG_CIFS_DFS_UPCALL + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + + if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + return true; +#endif + return false; +} + static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { @@ -135,6 +151,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; + /* + * Windows CIFS servers generally make DFS referrals look + * like directories in FIND_* responses with the reparse + * attribute flag also set (since DFS junctions are + * reparse points). We must revalidate at least these + * directory inodes before trying to use them (if + * they are DFS we will get PATH_NOT_COVERED back + * when queried directly and can then try to connect + * to the DFS target) + */ + if (cifs_dfs_is_possible(cifs_sb) && + (fattr->cf_cifsattrs & ATTR_REPARSE)) + fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -537,14 +566,14 @@ static int cifs_save_resume_key(const char *current_entry, * every entry (do not increment for . or .. entry). */ static int -find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, +find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, struct file *file, char **current_entry, int *num_to_ret) { __u16 search_flags; int rc = 0; int pos_in_buf = 0; loff_t first_entry_in_buffer; - loff_t index_to_find = file->f_pos; + loff_t index_to_find = pos; struct cifsFileInfo *cfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); struct TCP_Server_Info *server = tcon->ses->server; @@ -659,8 +688,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, - void *dirent, char *scratch_buf, unsigned int max_len) +static int cifs_filldir(char *find_entry, struct file *file, + struct dir_context *ctx, + char *scratch_buf, unsigned int max_len) { struct cifsFileInfo *file_info = file->private_data; struct super_block *sb = file->f_path.dentry->d_sb; @@ -740,13 +770,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, cifs_prime_dcache(file->f_dentry, &name, &fattr); ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - rc = filldir(dirent, name.name, name.len, file->f_pos, ino, - fattr.cf_dtype); - return rc; + return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); } -int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) +int cifs_readdir(struct file *file, struct dir_context *ctx) { int rc = 0; unsigned int xid; @@ -772,103 +800,86 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) goto rddir2_exit; } - switch ((int) file->f_pos) { - case 0: - if (filldir(direntry, ".", 1, file->f_pos, - file_inode(file)->i_ino, DT_DIR) < 0) { - cifs_dbg(VFS, "Filldir for current dir failed\n"); - rc = -ENOMEM; - break; - } - file->f_pos++; - case 1: - if (filldir(direntry, "..", 2, file->f_pos, - parent_ino(file->f_path.dentry), DT_DIR) < 0) { - cifs_dbg(VFS, "Filldir for parent dir failed\n"); - rc = -ENOMEM; - break; - } - file->f_pos++; - default: - /* 1) If search is active, - is in current search buffer? - if it before then restart search - if after then keep searching till find it */ - - if (file->private_data == NULL) { - rc = -EINVAL; - free_xid(xid); - return rc; - } - cifsFile = file->private_data; - if (cifsFile->srch_inf.endOfSearch) { - if (cifsFile->srch_inf.emptyDir) { - cifs_dbg(FYI, "End of search, empty dir\n"); - rc = 0; - break; - } - } /* else { - cifsFile->invalidHandle = true; - tcon->ses->server->close(xid, tcon, &cifsFile->fid); - } */ + if (!dir_emit_dots(file, ctx)) + goto rddir2_exit; - tcon = tlink_tcon(cifsFile->tlink); - rc = find_cifs_entry(xid, tcon, file, ¤t_entry, - &num_to_fill); - if (rc) { - cifs_dbg(FYI, "fce error %d\n", rc); - goto rddir2_exit; - } else if (current_entry != NULL) { - cifs_dbg(FYI, "entry %lld found\n", file->f_pos); - } else { - cifs_dbg(FYI, "could not find entry\n"); + /* 1) If search is active, + is in current search buffer? + if it before then restart search + if after then keep searching till find it */ + + if (file->private_data == NULL) { + rc = -EINVAL; + goto rddir2_exit; + } + cifsFile = file->private_data; + if (cifsFile->srch_inf.endOfSearch) { + if (cifsFile->srch_inf.emptyDir) { + cifs_dbg(FYI, "End of search, empty dir\n"); + rc = 0; goto rddir2_exit; } - cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", - num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); - max_len = tcon->ses->server->ops->calc_smb_size( - cifsFile->srch_inf.ntwrk_buf_start); - end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; - - tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); - if (tmp_buf == NULL) { - rc = -ENOMEM; + } /* else { + cifsFile->invalidHandle = true; + tcon->ses->server->close(xid, tcon, &cifsFile->fid); + } */ + + tcon = tlink_tcon(cifsFile->tlink); + rc = find_cifs_entry(xid, tcon, ctx->pos, file, ¤t_entry, + &num_to_fill); + if (rc) { + cifs_dbg(FYI, "fce error %d\n", rc); + goto rddir2_exit; + } else if (current_entry != NULL) { + cifs_dbg(FYI, "entry %lld found\n", ctx->pos); + } else { + cifs_dbg(FYI, "could not find entry\n"); + goto rddir2_exit; + } + cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n", + num_to_fill, cifsFile->srch_inf.ntwrk_buf_start); + max_len = tcon->ses->server->ops->calc_smb_size( + cifsFile->srch_inf.ntwrk_buf_start); + end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; + + tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL); + if (tmp_buf == NULL) { + rc = -ENOMEM; + goto rddir2_exit; + } + + for (i = 0; i < num_to_fill; i++) { + if (current_entry == NULL) { + /* evaluate whether this case is an error */ + cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", + num_to_fill, i); break; } - - for (i = 0; (i < num_to_fill) && (rc == 0); i++) { - if (current_entry == NULL) { - /* evaluate whether this case is an error */ - cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n", - num_to_fill, i); - break; - } - /* - * if buggy server returns . and .. late do we want to - * check for that here? - */ - rc = cifs_filldir(current_entry, file, filldir, - direntry, tmp_buf, max_len); - if (rc == -EOVERFLOW) { + /* + * if buggy server returns . and .. late do we want to + * check for that here? + */ + rc = cifs_filldir(current_entry, file, ctx, + tmp_buf, max_len); + if (rc) { + if (rc > 0) rc = 0; - break; - } - - file->f_pos++; - if (file->f_pos == - cifsFile->srch_inf.index_of_last_entry) { - cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", - file->f_pos, tmp_buf); - cifs_save_resume_key(current_entry, cifsFile); - break; - } else - current_entry = - nxt_dir_entry(current_entry, end_of_smb, - cifsFile->srch_inf.info_level); + break; } - kfree(tmp_buf); - break; - } /* end switch */ + + ctx->pos++; + if (ctx->pos == + cifsFile->srch_inf.index_of_last_entry) { + cifs_dbg(FYI, "last entry in buf at pos %lld %s\n", + ctx->pos, tmp_buf); + cifs_save_resume_key(current_entry, cifsFile); + break; + } else + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); + } + kfree(tmp_buf); rddir2_exit: free_xid(xid); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index f230571..79358e3 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (ses->server->sign) pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_UNICODE) { @@ -310,11 +309,10 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses, return; } -static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, - struct cifs_ses *ses, - const struct nls_table *nls_cp) +static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, + struct cifs_ses *ses, + const struct nls_table *nls_cp) { - int rc = 0; int len; char *bcc_ptr = *pbcc_area; @@ -322,24 +320,22 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, len = strnlen(bcc_ptr, bleft); if (len >= bleft) - return rc; + return; kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); if (ses->serverOS) strncpy(ses->serverOS, bcc_ptr, len); - if (strncmp(ses->serverOS, "OS/2", 4) == 0) { + if (strncmp(ses->serverOS, "OS/2", 4) == 0) cifs_dbg(FYI, "OS/2 server\n"); - ses->flags |= CIFS_SES_OS2; - } bcc_ptr += len + 1; bleft -= len + 1; len = strnlen(bcc_ptr, bleft); if (len >= bleft) - return rc; + return; kfree(ses->serverNOS); @@ -352,7 +348,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, len = strnlen(bcc_ptr, bleft); if (len > bleft) - return rc; + return; /* No domain field in LANMAN case. Domain is returned by old servers in the SMB negprot response */ @@ -360,8 +356,6 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft, but thus do return domain here we could add parsing for it later, but it is not very important */ cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); - - return rc; } int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, @@ -432,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; if (!ses->server->session_estab) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; @@ -471,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; - if (ses->server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; if (!ses->server->session_estab) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; @@ -558,6 +550,56 @@ setup_ntlmv2_ret: return rc; } +enum securityEnum +select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) +{ + switch (server->negflavor) { + case CIFS_NEGFLAVOR_EXTENDED: + switch (requested) { + case Kerberos: + case RawNTLMSSP: + return requested; + case Unspecified: + if (server->sec_ntlmssp && + (global_secflags & CIFSSEC_MAY_NTLMSSP)) + return RawNTLMSSP; + if ((server->sec_kerberos || server->sec_mskerberos) && + (global_secflags & CIFSSEC_MAY_KRB5)) + return Kerberos; + /* Fallthrough */ + default: + return Unspecified; + } + case CIFS_NEGFLAVOR_UNENCAP: + switch (requested) { + case NTLM: + case NTLMv2: + return requested; + case Unspecified: + if (global_secflags & CIFSSEC_MAY_NTLMV2) + return NTLMv2; + if (global_secflags & CIFSSEC_MAY_NTLM) + return NTLM; + /* Fallthrough */ + default: + return Unspecified; + } + case CIFS_NEGFLAVOR_LANMAN: + switch (requested) { + case LANMAN: + return requested; + case Unspecified: + if (global_secflags & CIFSSEC_MAY_LANMAN) + return LANMAN; + /* Fallthrough */ + default: + return Unspecified; + } + default: + return Unspecified; + } +} + int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) @@ -579,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, u16 blob_len; char *ntlmsspblob = NULL; - if (ses == NULL) + if (ses == NULL) { + WARN(1, "%s: ses == NULL!", __func__); return -EINVAL; + } - type = ses->server->secType; + type = select_sectype(ses->server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); + if (type == Unspecified) { + cifs_dbg(VFS, "Unable to select appropriate authentication method!"); + return -EINVAL; + } + if (type == RawNTLMSSP) { /* if memory allocation is successful, caller of this function * frees it. @@ -643,8 +692,6 @@ ssetup_ntlmssp_authenticate: } bcc_ptr = str_area; - ses->flags &= ~CIFS_SES_LANMAN; - iov[1].iov_base = NULL; iov[1].iov_len = 0; @@ -668,7 +715,6 @@ ssetup_ntlmssp_authenticate: ses->server->sec_mode & SECMODE_PW_ENCRYPT ? true : false, lnm_session_key); - ses->flags |= CIFS_SES_LANMAN; memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); bcc_ptr += CIFS_AUTH_RESP_SIZE; @@ -938,8 +984,7 @@ ssetup_ntlmssp_authenticate: } decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); } else { - rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, - ses, nls_cp); + decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); } ssetup_exit: diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3efdb9d..e813f04 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -449,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) * WRITEX header, not including the 4 byte RFC1001 length. */ if (!(server->capabilities & CAP_LARGE_WRITE_X) || - (!(server->capabilities & CAP_UNIX) && - (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + (!(server->capabilities & CAP_UNIX) && server->sign)) wsize = min_t(unsigned int, wsize, server->maxBuf - sizeof(WRITE_REQ) + 4); @@ -765,20 +764,14 @@ smb_set_file_info(struct inode *inode, const char *full_path, } tcon = tlink_tcon(tlink); - /* - * NT4 apparently returns success on this call, but it doesn't really - * work. - */ - if (!(tcon->ses->flags & CIFS_SES_NT4)) { - rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, - cifs_sb->local_nls, + rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - cinode->cifsAttrs = le32_to_cpu(buf->Attributes); - goto out; - } else if (rc != -EOPNOTSUPP && rc != -EINVAL) - goto out; + if (rc == 0) { + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) { + goto out; } cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); @@ -964,4 +957,6 @@ struct smb_version_values smb1_values = { .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, .oplock_read = OPLOCK_READ, + .signing_enabled = SECMODE_SIGN_ENABLED, + .signing_required = SECMODE_SIGN_REQUIRED, }; diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 7c0e214..c383508 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -54,5 +54,7 @@ #define SMB2_SIGNATURE_SIZE (16) #define SMB2_NTLMV2_SESSKEY_SIZE (16) #define SMB2_HMACSHA256_SIZE (32) +#define SMB2_CMACAES_SIZE (16) +#define SMB3_SIGNKEY_SIZE (16) #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 10383d8..b0c4334 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -266,6 +266,10 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength); break; case SMB2_IOCTL: + *off = le32_to_cpu( + ((struct smb2_ioctl_rsp *)hdr)->OutputOffset); + *len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount); + break; case SMB2_CHANGE_NOTIFY: default: /* BB FIXME for unimplemented cases above */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f2e76f3..6d15cab 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -281,6 +281,25 @@ smb2_clear_stats(struct cifs_tcon *tcon) } static void +smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon) +{ + seq_puts(m, "\n\tShare Capabilities:"); + if (tcon->capabilities & SMB2_SHARE_CAP_DFS) + seq_puts(m, " DFS,"); + if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY) + seq_puts(m, " CONTINUOUS AVAILABILITY,"); + if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT) + seq_puts(m, " SCALEOUT,"); + if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) + seq_puts(m, " CLUSTER,"); + if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC) + seq_puts(m, " ASYMMETRIC,"); + if (tcon->capabilities == 0) + seq_puts(m, " None"); + seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags); +} + +static void smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) { #ifdef CONFIG_CIFS_STATS @@ -292,7 +311,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, "\nSessionSetups: %d sent %d failed", atomic_read(&sent[SMB2_SESSION_SETUP_HE]), atomic_read(&failed[SMB2_SESSION_SETUP_HE])); -#define SMB2LOGOFF 0x0002 /* trivial request/resp */ seq_printf(m, "\nLogoffs: %d sent %d failed", atomic_read(&sent[SMB2_LOGOFF_HE]), atomic_read(&failed[SMB2_LOGOFF_HE])); @@ -645,6 +663,7 @@ struct smb_version_operations smb30_operations = { .dump_detail = smb2_dump_detail, .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -690,6 +709,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, }; @@ -709,6 +729,8 @@ struct smb_version_values smb20_values = { .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, }; struct smb_version_values smb21_values = { @@ -727,6 +749,8 @@ struct smb_version_values smb21_values = { .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, }; struct smb_version_values smb30_values = { @@ -745,4 +769,26 @@ struct smb_version_values smb30_values = { .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, +}; + +struct smb_version_values smb302_values = { + .version_string = SMB302_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, }; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2b95ce2b..2b312e4 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1,7 +1,7 @@ /* * fs/cifs/smb2pdu.c * - * Copyright (C) International Business Machines Corp., 2009, 2012 + * Copyright (C) International Business Machines Corp., 2009, 2013 * Etersoft, 2012 * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 @@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , if (!tcon) goto out; + /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */ + /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ + /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ + if ((tcon->ses) && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + hdr->CreditCharge = cpu_to_le16(1); + /* else CreditCharge MBZ */ + hdr->TreeId = tcon->tid; /* Uid is not converted */ if (tcon->ses) hdr->SessionId = tcon->ses->Suid; - /* BB check following DFS flags BB */ - /* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */ - if (tcon->share_flags & SHI1005_FLAGS_DFS) - hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; - /* BB how does SMB2 do case sensitive? */ - /* if (tcon->nocase) - hdr->Flags |= SMBFLG_CASELESS; */ - if (tcon->ses && tcon->ses->server && - (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED)) + + /* + * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have + * to pass the path on the Open SMB prefixed by \\server\share. + * Not sure when we would need to do the augmented path (if ever) and + * setting this flag breaks the SMB2 open operation since it is + * illegal to send an empty path name (without \\server\share prefix) + * when the DFS flag is set in the SMB open header. We could + * consider setting the flag on all operations other than open + * but it is safer to net set it for now. + */ +/* if (tcon->share_flags & SHI1005_FLAGS_DFS) + hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */ + + if (tcon->ses && tcon->ses->server && tcon->ses->server->sign) hdr->Flags |= SMB2_FLAGS_SIGNED; out: pdu->StructureSize2 = cpu_to_le16(parmsize); @@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct kvec iov[1]; int rc = 0; int resp_buftype; - struct TCP_Server_Info *server; - unsigned int sec_flags; - u16 temp = 0; + struct TCP_Server_Info *server = ses->server; int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; cifs_dbg(FYI, "Negotiate protocol\n"); - if (ses->server) - server = ses->server; - else { - rc = -EIO; - return rc; + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; } rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req); if (rc) return rc; - /* if any of auth flags (ie not sign or seal) are overriden use them */ - if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/ - else /* if override flags set only sign/seal OR them with global auth */ - sec_flags = global_secflags | ses->overrideSecFlg; - - cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); - req->hdr.SessionId = 0; req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); @@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) inc_rfc1001_len(req, 2); /* only one of SMB2 signing flags may be set in SMB2 request */ - if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) - temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; - else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */ - temp = SMB2_NEGOTIATE_SIGNING_ENABLED; - - req->SecurityMode = cpu_to_le16(temp); + if (ses->sign) + req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); + else if (global_secflags & CIFSSEC_MAY_SIGN) + req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); + else + req->SecurityMode = 0; req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); @@ -399,6 +401,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_dbg(FYI, "negotiated smb2.1 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); + else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); else { cifs_dbg(VFS, "Illegal dialect returned by server %d\n", le16_to_cpu(rsp->DialectRevision)); @@ -407,6 +411,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) } server->dialect = le16_to_cpu(rsp->DialectRevision); + /* SMB2 only has an extended negflavor */ + server->negflavor = CIFS_NEGFLAVOR_EXTENDED; server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); server->max_read = le32_to_cpu(rsp->MaxReadSize); server->max_write = le32_to_cpu(rsp->MaxWriteSize); @@ -418,44 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, &rsp->hdr); - if (blob_length == 0) { - cifs_dbg(VFS, "missing security blob on negprot\n"); - rc = -EIO; - goto neg_exit; - } - - cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); - if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { - cifs_dbg(FYI, "Signing required\n"); - if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | - SMB2_NEGOTIATE_SIGNING_ENABLED))) { - cifs_dbg(VFS, "signing required but server lacks support\n"); - rc = -EOPNOTSUPP; - goto neg_exit; - } - server->sec_mode |= SECMODE_SIGN_REQUIRED; - } else if (sec_flags & CIFSSEC_MAY_SIGN) { - cifs_dbg(FYI, "Signing optional\n"); - if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { - cifs_dbg(FYI, "Server requires signing\n"); - server->sec_mode |= SECMODE_SIGN_REQUIRED; - } else { - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } - } else { - cifs_dbg(FYI, "Signing disabled\n"); - if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) { - cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n"); - rc = -EOPNOTSUPP; - goto neg_exit; - } - server->sec_mode &= - ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } + /* + * See MS-SMB2 section 2.2.4: if no blob, client picks default which + * for us will be + * ses->sectype = RawNTLMSSP; + * but for time being this is our only auth choice so doesn't matter. + * We just found a server which sets blob length to zero expecting raw. + */ + if (blob_length == 0) + cifs_dbg(FYI, "missing security blob on negprot\n"); + rc = cifs_enable_signing(server, ses->sign); #ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ - rc = decode_neg_token_init(security_blob, blob_length, + if (rc) + goto neg_exit; + if (blob_length) + rc = decode_neg_token_init(security_blob, blob_length, &server->sec_type); if (rc == 1) rc = 0; @@ -480,9 +464,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, int rc = 0; int resp_buftype; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ - struct TCP_Server_Info *server; - unsigned int sec_flags; - u8 temp = 0; + struct TCP_Server_Info *server = ses->server; u16 blob_length = 0; char *security_blob; char *ntlmssp_blob = NULL; @@ -490,11 +472,9 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Session Setup\n"); - if (ses->server) - server = ses->server; - else { - rc = -EIO; - return rc; + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; } /* @@ -505,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, if (!ses->ntlmssp) return -ENOMEM; - ses->server->secType = RawNTLMSSP; + /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ + ses->sectype = RawNTLMSSP; ssetup_ntlmssp_authenticate: if (phase == NtLmChallenge) @@ -515,28 +496,19 @@ ssetup_ntlmssp_authenticate: if (rc) return rc; - /* if any of auth flags (ie not sign or seal) are overriden use them */ - if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) - sec_flags = ses->overrideSecFlg; /* BB FIXME fix sign flags?*/ - else /* if override flags set only sign/seal OR them with global auth */ - sec_flags = global_secflags | ses->overrideSecFlg; - - cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags); - req->hdr.SessionId = 0; /* First session, not a reauthenticate */ req->VcNumber = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); /* only one of SMB2 signing flags may be set in SMB2 request */ - if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) - temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; - else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) - temp = SMB2_NEGOTIATE_SIGNING_REQUIRED; - else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */ - temp = SMB2_NEGOTIATE_SIGNING_ENABLED; - - req->SecurityMode = temp; + if (server->sign) + req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED; + else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */ + req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED; + else + req->SecurityMode = 0; + req->Capabilities = 0; req->Channel = 0; /* MBZ */ @@ -679,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) /* since no tcon, smb2_init can not do this, so do here */ req->hdr.SessionId = ses->Suid; - if (server->sec_mode & SECMODE_SIGN_REQUIRED) + if (server->sign) req->hdr.Flags |= SMB2_FLAGS_SIGNED; rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0); @@ -788,11 +760,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } tcon->share_flags = le32_to_cpu(rsp->ShareFlags); + tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; tcon->tid = rsp->hdr.TreeId; - strncpy(tcon->treeName, tree, MAX_TREE_SIZE); + strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) @@ -1036,6 +1009,122 @@ creat_exit: return rc; } +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, + u32 indatalen, char **out_data, u32 *plen /* returned data len */) +{ + struct smb2_ioctl_req *req; + struct smb2_ioctl_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[2]; + int resp_buftype; + int num_iovecs; + int rc = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req); + if (rc) + return rc; + + req->CtlCode = cpu_to_le32(opcode); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; + + if (indatalen) { + req->InputCount = cpu_to_le32(indatalen); + /* do not set InputOffset if no input data */ + req->InputOffset = + cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4); + iov[1].iov_base = in_data; + iov[1].iov_len = indatalen; + num_iovecs = 2; + } else + num_iovecs = 1; + + req->OutputOffset = 0; + req->OutputCount = 0; /* MBZ */ + + /* + * Could increase MaxOutputResponse, but that would require more + * than one credit. Windows typically sets this smaller, but for some + * ioctls it may be useful to allow server to send more. No point + * limiting what the server can send as long as fits in one credit + */ + req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + + if (is_fsctl) + req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); + else + req->Flags = 0; + + iov[0].iov_base = (char *)req; + /* 4 for rfc1002 length field */ + iov[0].iov_len = get_rfc1002_length(req) + 4; + + if (indatalen) + inc_rfc1001_len(req, indatalen); + + rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); + rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; + + if (rc != 0) { + if (tcon) + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + goto ioctl_exit; + } + + /* check if caller wants to look at return data or just return rc */ + if ((plen == NULL) || (out_data == NULL)) + goto ioctl_exit; + + *plen = le32_to_cpu(rsp->OutputCount); + + /* We check for obvious errors in the output buffer length and offset */ + if (*plen == 0) + goto ioctl_exit; /* server returned no data */ + else if (*plen > 0xFF00) { + cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen); + *plen = 0; + rc = -EIO; + goto ioctl_exit; + } + + if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) { + cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen, + le32_to_cpu(rsp->OutputOffset)); + *plen = 0; + rc = -EIO; + goto ioctl_exit; + } + + *out_data = kmalloc(*plen, GFP_KERNEL); + if (*out_data == NULL) { + rc = -ENOMEM; + goto ioctl_exit; + } + + memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + *plen); +ioctl_exit: + free_rsp_buf(resp_buftype, rsp); + return rc; +} + int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) @@ -1384,8 +1473,7 @@ smb2_readv_callback(struct mid_q_entry *mid) case MID_RESPONSE_RECEIVED: credits_received = le16_to_cpu(buf->CreditRequest); /* result already set, check signature */ - if (server->sec_mode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (server->sign) { int rc; rc = smb2_verify_signature(&rqst, server); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4cb4ced..f31043b 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1,7 +1,7 @@ /* * fs/cifs/smb2pdu.h * - * Copyright (c) International Business Machines Corp., 2009, 2010 + * Copyright (c) International Business Machines Corp., 2009, 2013 * Etersoft, 2012 * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 @@ -170,6 +170,7 @@ struct smb2_negotiate_req { #define SMB20_PROT_ID 0x0202 #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 #define BAD_PROT_ID 0xFFFF /* SecurityMode flags */ @@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp { #define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 #define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 #define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 +#define SHI1005_FLAGS_ALL 0x0000FF33 /* Possible share capabilities */ -#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ +#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ +#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ +#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ +#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ struct smb2_tree_disconnect_req { struct smb2_hdr hdr; @@ -477,6 +485,75 @@ struct create_lease { struct lease_context lcontext; } __packed; +/* this goes in the ioctl buffer when doing a copychunk request */ +struct copychunk_ioctl { + char SourceKey[24]; + __le32 ChunkCount; /* we are only sending 1 */ + __le32 Reserved; + /* array will only be one chunk long for us */ + __le64 SourceOffset; + __le64 TargetOffset; + __le32 Length; /* how many bytes to copy */ + __u32 Reserved2; +} __packed; + +/* Response and Request are the same format */ +struct validate_negotiate_info { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 DialectCount; + __le16 Dialect[1]; +} __packed; + +#define RSS_CAPABLE 0x00000001 +#define RDMA_CAPABLE 0x00000002 + +struct network_interface_info_ioctl_rsp { + __le32 Next; /* next interface. zero if this is last one */ + __le32 IfIndex; + __le32 Capability; /* RSS or RDMA Capable */ + __le32 Reserved; + __le64 LinkSpeed; + char SockAddr_Storage[128]; +} __packed; + +#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ + +struct smb2_ioctl_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 InputOffset; + __le32 InputCount; + __le32 MaxInputResponse; + __le32 OutputOffset; + __le32 OutputCount; + __le32 MaxOutputResponse; + __le32 Flags; + __u32 Reserved2; + char Buffer[0]; +} __packed; + +struct smb2_ioctl_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ + __le32 InputOffset; + __le32 InputCount; + __le32 OutputOffset; + __le32 OutputCount; + __le32 Flags; + __u32 Reserved2; + /* char * buffer[] */ +} __packed; + /* Currently defined values for close flags */ #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) struct smb2_close_req { @@ -517,17 +594,25 @@ struct smb2_flush_rsp { __le16 Reserved; } __packed; +/* For read request Flags field below, following flag is defined for SMB3.02 */ +#define SMB2_READFLAG_READ_UNBUFFERED 0x01 + +/* Channel field for read and write: exactly one of following flags can be set*/ +#define SMB2_CHANNEL_NONE 0x00000000 +#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */ + struct smb2_read_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 49 */ __u8 Padding; /* offset from start of SMB2 header to place read */ - __u8 Reserved; + __u8 Flags; /* MBZ unless SMB3.02 or later */ __le32 Length; __le64 Offset; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ __le32 MinimumCount; - __le32 Channel; /* Reserved MBZ */ + __le32 Channel; /* MBZ except for SMB3 or later */ __le32 RemainingBytes; __le16 ReadChannelInfoOffset; /* Reserved MBZ */ __le16 ReadChannelInfoLength; /* Reserved MBZ */ @@ -545,8 +630,9 @@ struct smb2_read_rsp { __u8 Buffer[1]; } __packed; -/* For write request Flags field below the following flag is defined: */ -#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 +/* For write request Flags field below the following flags are defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ +#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ struct smb2_write_req { struct smb2_hdr hdr; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 2aa3535..d4e1eb8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -111,6 +111,10 @@ extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __u32 desired_access, __u32 create_disposition, __u32 file_attributes, __u32 create_options, __u8 *oplock, struct smb2_file_all_info *buf); +extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 01f0ac8..09b4fba 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -116,11 +116,155 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } +void +generate_smb3signingkey(struct TCP_Server_Info *server) +{ + unsigned char zero = 0x0; + __u8 i[4] = {0, 0, 0, 1}; + __u8 L[4] = {0, 0, 0, 128}; + int rc = 0; + unsigned char prfhash[SMB2_HMACSHA256_SIZE]; + unsigned char *hashptr = prfhash; + + memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); + memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + + rc = crypto_shash_setkey(server->secmech.hmacsha256, + server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set with session key\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + i, 4); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with n\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + "SMB2AESCMAC", 12); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with label\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + &zero, 1); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with zero\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + "SmbSign", 8); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with context\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + L, 4); + if (rc) { + cifs_dbg(VFS, "%s: Could not update with L\n", __func__); + goto smb3signkey_ret; + } + + rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, + hashptr); + if (rc) { + cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); + goto smb3signkey_ret; + } + + memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + +smb3signkey_ret: + return; +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - cifs_dbg(FYI, "smb3 signatures not supported yet\n"); - return -EOPNOTSUPP; + int i, rc; + unsigned char smb3_signature[SMB2_CMACAES_SIZE]; + unsigned char *sigptr = smb3_signature; + struct kvec *iov = rqst->rq_iov; + int n_vec = rqst->rq_nvec; + struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + + memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); + memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); + + rc = crypto_shash_setkey(server->secmech.cmacaes, + server->smb3signingkey, SMB2_CMACAES_SIZE); + if (rc) { + cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); + return rc; + } + + rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash); + if (rc) { + cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__); + return rc; + } + + for (i = 0; i < n_vec; i++) { + if (iov[i].iov_len == 0) + continue; + if (iov[i].iov_base == NULL) { + cifs_dbg(VFS, "null iovec entry"); + return -EIO; + } + /* + * The first entry includes a length field (which does not get + * signed that occupies the first 4 bytes before the header). + */ + if (i == 0) { + if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + rc = + crypto_shash_update( + &server->secmech.sdesccmacaes->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); + } else { + rc = + crypto_shash_update( + &server->secmech.sdesccmacaes->shash, + iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n", + __func__); + return rc; + } + } + + /* now hash over the rq_pages array */ + for (i = 0; i < rqst->rq_npages; i++) { + struct kvec p_iov; + + cifs_rqst_page_to_kvec(rqst, i, &p_iov); + crypto_shash_update(&server->secmech.sdesccmacaes->shash, + p_iov.iov_base, p_iov.iov_len); + kunmap(rqst->rq_pages[i]); + } + + rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash, + sigptr); + if (rc) + cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__); + + memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE); + + return rc; } /* must be called with server->srv_mutex held */ @@ -275,8 +419,7 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, dump_smb(mid->resp_buf, min_t(u32, 80, len)); /* convert the length into a more usable form */ - if ((len > 24) && - (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) { + if (len > 24 && server->sign) { int rc; rc = smb2_verify_signature(&rqst, server); diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 7056b89..d952ee4 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -1,7 +1,7 @@ /* * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions * - * Copyright (c) International Business Machines Corp., 2002,2009 + * Copyright (c) International Business Machines Corp., 2002,2013 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -22,7 +22,7 @@ /* IOCTL information */ /* * List of ioctl/fsctl function codes that are or could be useful in the - * future to remote clients like cifs or SMB2 client. There is probably + * future to remote clients like cifs or SMB2/SMB3 client. This is probably * a slightly larger set of fsctls that NTFS local filesystem could handle, * including the seven below that we do not have struct definitions for. * Even with protocol definitions for most of these now available, we still @@ -30,7 +30,13 @@ * remotely. Some of the following, such as the encryption/compression ones * could be invoked from tools via a specialized hook into the VFS rather * than via the standard vfs entry points + * + * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are + * below). Additional detail on less common ones can be found in MS-FSCC + * section 2.3. */ +#define FSCTL_DFS_GET_REFERRALS 0x00060194 +#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0 #define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 #define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 #define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 @@ -71,14 +77,31 @@ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ +#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ #define FSCTL_SIS_LINK_FILES 0x0009C104 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ #define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ /* strange that the number for this op is not sequential with previous op */ #define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ +/* Enumerate previous versions of a file */ +#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064 +/* Retrieve an opaque file reference for server-side data movement ie copy */ +#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078 +#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ +#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */ +/* Perform server-side data movement */ +#define FSCTL_SRV_COPYCHUNK 0x001440F2 +#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2 +#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */ +#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */ #define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 #define IO_REPARSE_TAG_HSM 0xC0000004 #define IO_REPARSE_TAG_SIS 0x80000007 + +/* fsctl flags */ +/* If Flags is set to this value, the request is an FSCTL not ioctl request */ +#define SMB2_0_IOCTL_IS_FSCTL 0x00000001 + diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index bfbf470..6fdcb1b 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -447,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) { int error; - error = wait_event_freezekillable(server->response_q, + error = wait_event_freezekillable_unsafe(server->response_q, midQ->mid_state != MID_REQUEST_SUBMITTED); if (error < 0) return -ERESTARTSYS; @@ -463,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct mid_q_entry *mid; /* enable signing if server requires it */ - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if (server->sign) hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; mid = AllocMidQEntry(hdr, server); @@ -612,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, dump_smb(mid->resp_buf, min_t(u32, 92, len)); /* convert the length into a more usable form */ - if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + if (server->sign) { struct kvec iov; int rc = 0; struct smb_rqst rqst = { .rq_iov = &iov, diff --git a/fs/coda/dir.c b/fs/coda/dir.c index b7d3a05..190effc 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, struct inode *new_inode, struct dentry *new_dentry); /* dir file-ops */ -static int coda_readdir(struct file *file, void *buf, filldir_t filldir); +static int coda_readdir(struct file *file, struct dir_context *ctx); /* dentry ops */ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); static int coda_dentry_delete(const struct dentry *); /* support routines */ -static int coda_venus_readdir(struct file *coda_file, void *buf, - filldir_t filldir); +static int coda_venus_readdir(struct file *, struct dir_context *); /* same as fs/bad_inode.c */ static int coda_return_EIO(void) @@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations = const struct file_operations coda_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = coda_readdir, + .iterate = coda_readdir, .open = coda_open, .release = coda_release, .fsync = coda_fsync, @@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, /* file operations for directories */ -static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) +static int coda_readdir(struct file *coda_file, struct dir_context *ctx) { struct coda_file_info *cfi; struct file *host_file; @@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) if (!host_file->f_op) return -ENOTDIR; - if (host_file->f_op->readdir) - { - /* potemkin case: we were handed a directory inode. - * We can't use vfs_readdir because we have to keep the file - * position in sync between the coda_file and the host_file. - * and as such we need grab the inode mutex. */ + if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); - host_file->f_pos = coda_file->f_pos; - ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->readdir(host_file, buf, filldir); + ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - - coda_file->f_pos = host_file->f_pos; mutex_unlock(&host_inode->i_mutex); + return ret; } - else /* Venus: we must read Venus dirents from a file */ - ret = coda_venus_readdir(coda_file, buf, filldir); - - return ret; + /* Venus: we must read Venus dirents from a file */ + return coda_venus_readdir(coda_file, ctx); } static inline unsigned int CDT2DT(unsigned char cdt) @@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt) } /* support routines */ -static int coda_venus_readdir(struct file *coda_file, void *buf, - filldir_t filldir) +static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) { - int result = 0; /* # of entries returned */ struct coda_file_info *cfi; struct coda_inode_info *cii; struct file *host_file; @@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); if (!vdir) return -ENOMEM; - if (coda_file->f_pos == 0) { - ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); - if (ret < 0) - goto out; - result++; - coda_file->f_pos++; - } - if (coda_file->f_pos == 1) { - ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR); - if (ret < 0) - goto out; - result++; - coda_file->f_pos++; - } + if (!dir_emit_dots(coda_file, ctx)) + goto out; + while (1) { /* read entries from the directory file */ - ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, + ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, sizeof(*vdir)); if (ret < 0) { printk(KERN_ERR "coda readdir: read dir %s failed %d\n", @@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf, /* Make sure we skip '.' and '..', we already got those */ if (name.name[0] == '.' && (name.len == 1 || - (vdir->d_name[1] == '.' && name.len == 2))) + (name.name[1] == '.' && name.len == 2))) vdir->d_fileno = name.len = 0; /* skip null entries */ if (vdir->d_fileno && name.len) { - /* try to look up this entry in the dcache, that way - * userspace doesn't have to worry about breaking - * getcwd by having mismatched inode numbers for - * internal volume mountpoints. */ - ino = find_inode_number(de, &name); - if (!ino) ino = vdir->d_fileno; - + ino = vdir->d_fileno; type = CDT2DT(vdir->d_type); - ret = filldir(buf, name.name, name.len, - coda_file->f_pos, ino, type); - /* failure means no space for filling in this round */ - if (ret < 0) break; - result++; + if (!dir_emit(ctx, name.name, name.len, ino, type)) + break; } /* we'll always have progress because d_reclen is unsigned and * we've already established it is non-zero. */ - coda_file->f_pos += vdir->d_reclen; + ctx->pos += vdir->d_reclen; } out: kfree(vdir); - return result ? result : ret; + return 0; } /* called when a cache lookup succeeds */ @@ -560,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) if (cii->c_flags & C_FLUSH) coda_flag_inode_children(inode, C_FLUSH); - if (de->d_count > 1) + if (d_count(de) > 1) /* pretend it's valid, but don't change the flags */ goto out; diff --git a/fs/compat.c b/fs/compat.c index fc3b55d..6af20de 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -832,6 +832,7 @@ struct compat_old_linux_dirent { }; struct compat_readdir_callback { + struct dir_context ctx; struct compat_old_linux_dirent __user *dirent; int result; }; @@ -873,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, { int error; struct fd f = fdget(fd); - struct compat_readdir_callback buf; + struct compat_readdir_callback buf = { + .ctx.actor = compat_fillonedir, + .dirent = dirent + }; if (!f.file) return -EBADF; - buf.result = 0; - buf.dirent = dirent; - - error = vfs_readdir(f.file, compat_fillonedir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (buf.result) error = buf.result; @@ -897,6 +898,7 @@ struct compat_linux_dirent { }; struct compat_getdents_callback { + struct dir_context ctx; struct compat_linux_dirent __user *current_dir; struct compat_linux_dirent __user *previous; int count; @@ -951,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd, { struct fd f; struct compat_linux_dirent __user * lastdirent; - struct compat_getdents_callback buf; + struct compat_getdents_callback buf = { + .ctx.actor = compat_filldir, + .current_dir = dirent, + .count = count + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -961,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, compat_filldir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - if (put_user(f.file->f_pos, &lastdirent->d_off)) + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -983,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 struct compat_getdents_callback64 { + struct dir_context ctx; struct linux_dirent64 __user *current_dir; struct linux_dirent64 __user *previous; int count; @@ -1036,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, { struct fd f; struct linux_dirent64 __user * lastdirent; - struct compat_getdents_callback64 buf; + struct compat_getdents_callback64 buf = { + .ctx.actor = compat_filldir64, + .current_dir = dirent, + .count = count + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -1046,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, compat_filldir64, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - typeof(lastdirent->d_off) d_off = f.file->f_pos; + typeof(lastdirent->d_off) d_off = buf.ctx.pos; if (__put_user_unaligned(d_off, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 996cdc5..5d19acf 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -66,7 +66,6 @@ #include <linux/gigaset_dev.h> #ifdef CONFIG_BLOCK -#include <linux/loop.h> #include <linux/cdrom.h> #include <linux/fd.h> #include <scsi/scsi.h> @@ -954,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP) /* Socket level stuff */ COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) /* md calls this on random blockdevs */ IGNORE_IOCTL(RAID_VERSION) /* qemu/qemu-img might call these two on plain files for probing */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7aabc6a..5e7c60c1 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d) if (d->d_inode) simple_rmdir(parent->d_inode,d); - pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); + pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d)); dput(parent); } @@ -1532,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd) return (sd->s_mode >> 12) & 15; } -static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int configfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct super_block *sb = dentry->d_sb; struct configfs_dirent * parent_sd = dentry->d_fsdata; - struct configfs_dirent *cursor = filp->private_data; + struct configfs_dirent *cursor = file->private_data; struct list_head *p, *q = &cursor->s_sibling; ino_t ino = 0; - int i = filp->f_pos; - switch (i) { - case 0: - ino = dentry->d_inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - default: - if (filp->f_pos == 2) { - spin_lock(&configfs_dirent_lock); - list_move(q, &parent_sd->s_children); - spin_unlock(&configfs_dirent_lock); - } - for (p=q->next; p!= &parent_sd->s_children; p=p->next) { - struct configfs_dirent *next; - const char * name; - int len; - struct inode *inode = NULL; + if (!dir_emit_dots(file, ctx)) + return 0; + if (ctx->pos == 2) { + spin_lock(&configfs_dirent_lock); + list_move(q, &parent_sd->s_children); + spin_unlock(&configfs_dirent_lock); + } + for (p = q->next; p != &parent_sd->s_children; p = p->next) { + struct configfs_dirent *next; + const char *name; + int len; + struct inode *inode = NULL; + + next = list_entry(p, struct configfs_dirent, s_sibling); + if (!next->s_element) + continue; - next = list_entry(p, struct configfs_dirent, - s_sibling); - if (!next->s_element) - continue; - - name = configfs_get_name(next); - len = strlen(name); - - /* - * We'll have a dentry and an inode for - * PINNED items and for open attribute - * files. We lock here to prevent a race - * with configfs_d_iput() clearing - * s_dentry before calling iput(). - * - * Why do we go to the trouble? If - * someone has an attribute file open, - * the inode number should match until - * they close it. Beyond that, we don't - * care. - */ - spin_lock(&configfs_dirent_lock); - dentry = next->s_dentry; - if (dentry) - inode = dentry->d_inode; - if (inode) - ino = inode->i_ino; - spin_unlock(&configfs_dirent_lock); - if (!inode) - ino = iunique(sb, 2); + name = configfs_get_name(next); + len = strlen(name); + + /* + * We'll have a dentry and an inode for + * PINNED items and for open attribute + * files. We lock here to prevent a race + * with configfs_d_iput() clearing + * s_dentry before calling iput(). + * + * Why do we go to the trouble? If + * someone has an attribute file open, + * the inode number should match until + * they close it. Beyond that, we don't + * care. + */ + spin_lock(&configfs_dirent_lock); + dentry = next->s_dentry; + if (dentry) + inode = dentry->d_inode; + if (inode) + ino = inode->i_ino; + spin_unlock(&configfs_dirent_lock); + if (!inode) + ino = iunique(sb, 2); - if (filldir(dirent, name, len, filp->f_pos, ino, - dt_type(next)) < 0) - return 0; + if (!dir_emit(ctx, name, len, ino, dt_type(next))) + return 0; - spin_lock(&configfs_dirent_lock); - list_move(q, p); - spin_unlock(&configfs_dirent_lock); - p = q; - filp->f_pos++; - } + spin_lock(&configfs_dirent_lock); + list_move(q, p); + spin_unlock(&configfs_dirent_lock); + p = q; + ctx->pos++; } return 0; } @@ -1661,7 +1643,7 @@ const struct file_operations configfs_dir_operations = { .release = configfs_dir_close, .llseek = configfs_dir_lseek, .read = generic_read_dir, - .readdir = configfs_readdir, + .iterate = configfs_readdir, }; int configfs_register_subsystem(struct configfs_subsystem *subsys) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2b6cb23..1d1c41f 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, count); + len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); diff --git a/fs/coredump.c b/fs/coredump.c index dafafba..72f816d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -45,69 +45,79 @@ #include <trace/events/sched.h> int core_uses_pid; -char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_name_size = CORENAME_MAX_SIZE; struct core_name { char *corename; int used, size; }; -static atomic_t call_count = ATOMIC_INIT(1); /* The maximal length of core_pattern is also specified in sysctl.c */ -static int expand_corename(struct core_name *cn) +static int expand_corename(struct core_name *cn, int size) { - char *old_corename = cn->corename; - - cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); - cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); + char *corename = krealloc(cn->corename, size, GFP_KERNEL); - if (!cn->corename) { - kfree(old_corename); + if (!corename) return -ENOMEM; - } + if (size > core_name_size) /* racy but harmless */ + core_name_size = size; + + cn->size = ksize(corename); + cn->corename = corename; return 0; } +static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) +{ + int free, need; + +again: + free = cn->size - cn->used; + need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + if (need < free) { + cn->used += need; + return 0; + } + + if (!expand_corename(cn, cn->size + need - free + 1)) + goto again; + + return -ENOMEM; +} + static int cn_printf(struct core_name *cn, const char *fmt, ...) { - char *cur; - int need; - int ret; va_list arg; + int ret; va_start(arg, fmt); - need = vsnprintf(NULL, 0, fmt, arg); + ret = cn_vprintf(cn, fmt, arg); va_end(arg); - if (likely(need < cn->size - cn->used - 1)) - goto out_printf; + return ret; +} - ret = expand_corename(cn); - if (ret) - goto expand_fail; +static int cn_esc_printf(struct core_name *cn, const char *fmt, ...) +{ + int cur = cn->used; + va_list arg; + int ret; -out_printf: - cur = cn->corename + cn->used; va_start(arg, fmt); - vsnprintf(cur, need + 1, fmt, arg); + ret = cn_vprintf(cn, fmt, arg); va_end(arg); - cn->used += need; - return 0; -expand_fail: + for (; cur < cn->used; ++cur) { + if (cn->corename[cur] == '/') + cn->corename[cur] = '!'; + } return ret; } -static void cn_escape(char *str) -{ - for (; *str; str++) - if (*str == '/') - *str = '!'; -} - static int cn_print_exe_file(struct core_name *cn) { struct file *exe_file; @@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn) int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) { - char *commstart = cn->corename + cn->used; - ret = cn_printf(cn, "%s (path unknown)", current->comm); - cn_escape(commstart); - return ret; - } + if (!exe_file) + return cn_esc_printf(cn, "%s (path unknown)", current->comm); pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - cn_escape(path); - - ret = cn_printf(cn, "%s", path); + ret = cn_esc_printf(cn, "%s", path); free_buf: kfree(pathbuf); @@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) int pid_in_pattern = 0; int err = 0; - cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); - cn->corename = kmalloc(cn->size, GFP_KERNEL); cn->used = 0; - - if (!cn->corename) + cn->corename = NULL; + if (expand_corename(cn, core_name_size)) return -ENOMEM; + cn->corename[0] = '\0'; + + if (ispipe) + ++pat_ptr; /* Repeat as long as we have more pattern to process and more output space */ while (*pat_ptr) { if (*pat_ptr != '%') { - if (*pat_ptr == 0) - goto out; err = cn_printf(cn, "%c", *pat_ptr++); } else { switch (*++pat_ptr) { @@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; } /* hostname */ - case 'h': { - char *namestart = cn->corename + cn->used; + case 'h': down_read(&uts_sem); - err = cn_printf(cn, "%s", + err = cn_esc_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); - cn_escape(namestart); break; - } /* executable */ - case 'e': { - char *commstart = cn->corename + cn->used; - err = cn_printf(cn, "%s", current->comm); - cn_escape(commstart); + case 'e': + err = cn_esc_printf(cn, "%s", current->comm); break; - } case 'E': err = cn_print_exe_file(cn); break; @@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) return err; } +out: /* Backward compatibility with core_uses_pid: * * If core_pattern does not include a %p (as is the default) @@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) if (err) return err; } -out: return ispipe; } @@ -549,7 +547,7 @@ void do_coredump(siginfo_t *siginfo) if (ispipe < 0) { printk(KERN_WARNING "format_corename failed\n"); printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; + goto fail_unlock; } if (cprm.limit == 1) { @@ -584,7 +582,7 @@ void do_coredump(siginfo_t *siginfo) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); + helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); @@ -601,7 +599,7 @@ void do_coredump(siginfo_t *siginfo) argv_free(helper_argv); if (retval) { - printk(KERN_INFO "Core dump to %s pipe failed\n", + printk(KERN_INFO "Core dump to |%s pipe failed\n", cn.corename); goto close_fail; } @@ -669,7 +667,6 @@ fail_dropcount: atomic_dec(&core_dump_count); fail_unlock: kfree(cn.corename); -fail_corename: coredump_finish(mm, core_dumped); revert_creds(old_cred); fail_creds: diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 35b1c7b..e501ac3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* * Read a cramfs directory entry. */ -static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int cramfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; - int copied; /* Offset within the thing. */ - offset = filp->f_pos; - if (offset >= inode->i_size) + if (ctx->pos >= inode->i_size) return 0; + offset = ctx->pos; /* Directory entries are always 4-byte aligned */ if (offset & 3) return -EINVAL; @@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (!buf) return -ENOMEM; - copied = 0; while (offset < inode->i_size) { struct cramfs_inode *de; unsigned long nextoffset; char *name; ino_t ino; umode_t mode; - int namelen, error; + int namelen; mutex_lock(&read_mutex); de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); @@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; namelen--; } - error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); - if (error) + if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) break; - offset = nextoffset; - filp->f_pos = offset; - copied++; + ctx->pos = offset = nextoffset; } kfree(buf); return 0; @@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = { static const struct file_operations cramfs_directory_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = cramfs_readdir, + .iterate = cramfs_readdir, }; static const struct inode_operations cramfs_dir_inode_operations = { diff --git a/fs/dcache.c b/fs/dcache.c index f09b908..87bdb53 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias); * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { @@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) security_d_instantiate(dentry, inode); d_rehash(dentry); } - } else - d_add(dentry, inode); + } else { + d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); + } return new; } EXPORT_SYMBOL(d_splice_alias); @@ -1723,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci); * Do the slow-case of the dentry name compare. * * Unlike the dentry_cmp() function, we need to atomically - * load the name, length and inode information, so that the + * load the name and length information, so that the * filesystem can rely on them, and can use the 'name' and * 'len' information without worrying about walking off the * end of memory etc. @@ -1741,22 +1748,18 @@ enum slow_d_compare { static noinline enum slow_d_compare slow_dentry_cmp( const struct dentry *parent, - struct inode *inode, struct dentry *dentry, unsigned int seq, const struct qstr *name) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; - struct inode *i = dentry->d_inode; if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); return D_COMP_SEQRETRY; } - if (parent->d_op->d_compare(parent, inode, - dentry, i, - tlen, tname, name)) + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) return D_COMP_NOMATCH; return D_COMP_OK; } @@ -1766,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp( * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found - * @inode: returns dentry->d_inode when the inode was found valid. * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name @@ -1793,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp( */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, - unsigned *seqp, struct inode *inode) + unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; @@ -1827,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, seqretry: /* * The dentry sequence count protects us from concurrent - * renames, and thus protects inode, parent and name fields. + * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order - * to do anything useful with the returned dentry, - * including using the 'd_inode' pointer. + * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it @@ -1845,12 +1846,12 @@ seqretry: continue; if (d_unhashed(dentry)) continue; - *seqp = seq; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; - switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { + *seqp = seq; + switch (slow_dentry_cmp(parent, dentry, seq, name)) { case D_COMP_OK: return dentry; case D_COMP_NOMATCH: @@ -1862,6 +1863,7 @@ seqretry: if (dentry->d_name.hash_len != hashlen) continue; + *seqp = seq; if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) return dentry; } @@ -1959,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) if (parent->d_flags & DCACHE_OP_COMPARE) { int tlen = dentry->d_name.len; const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, parent->d_inode, - dentry, dentry->d_inode, - tlen, tname, name)) + if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) goto next; } else { if (dentry->d_name.len != len) @@ -1998,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) */ name->hash = full_name_hash(name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { - int err = dir->d_op->d_hash(dir, dir->d_inode, name); + int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } @@ -2968,34 +2968,21 @@ rename_retry: goto again; } -/** - * find_inode_number - check for dentry with name - * @dir: directory to check - * @name: Name to find. - * - * Check whether a dentry already exists for the given name, - * and return the inode number if it has an inode. Otherwise - * 0 is returned. - * - * This routine is used to post-process directory listings for - * filesystems using synthetic inode numbers, and is necessary - * to keep getcwd() working. - */ - -ino_t find_inode_number(struct dentry *dir, struct qstr *name) +void d_tmpfile(struct dentry *dentry, struct inode *inode) { - struct dentry * dentry; - ino_t ino = 0; - - dentry = d_hash_and_lookup(dir, name); - if (!IS_ERR_OR_NULL(dentry)) { - if (dentry->d_inode) - ino = dentry->d_inode->i_ino; - dput(dentry); - } - return ino; + inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_alias) || + !d_unlinked(dentry)); + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); + d_instantiate(dentry, inode); } -EXPORT_SYMBOL(find_inode_number); +EXPORT_SYMBOL(d_tmpfile); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index c5ca6ae..6314629 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -21,6 +21,7 @@ #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> +#include <linux/atomic.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_size_t); +static int debugfs_atomic_t_set(void *data, u64 val) +{ + atomic_set((atomic_t *)data, val); + return 0; +} +static int debugfs_atomic_t_get(void *data, u64 *val) +{ + *val = atomic_read((atomic_t *)data); + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, + debugfs_atomic_t_set, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); + +/** + * debugfs_create_atomic_t - create a debugfs file that is used to read and + * write an atomic_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); static ssize_t read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) @@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; + buf[buf_size] = '\0'; if (strtobool(buf, &bv) == 0) *val = bv; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 7d58d5b..76feb4b 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, const char *buf, size_t len) { - strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); - strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); + strlcpy(dlm_config.ci_cluster_name, buf, + sizeof(dlm_config.ci_cluster_name)); + strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); return len; } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 1b11466..e223a91 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; if (b == 1) { int len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > r->res_ls->ls_lvblen) + len = r->res_ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); lkb->lkb_lvbseq = ms->m_lvbseq; } @@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > ls->ls_lvblen) + len = ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); } return 0; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3ca79d3..88556dc 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force) void dlm_stop_lockspaces(void) { struct dlm_ls *ls; + int count; restart: + count = 0; spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { - if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) { + count++; continue; + } spin_unlock(&lslist_lock); log_error(ls, "no userland control daemon, stopping lockspace"); dlm_ls_stop(ls); goto restart; } spin_unlock(&lslist_lock); + + if (count) + log_print("dlm user daemon left %d lockspaces", count); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d0ccd2f..d90909e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -52,7 +52,6 @@ #include <linux/mutex.h> #include <linux/sctp.h> #include <linux/slab.h> -#include <linux/sctp.h> #include <net/sctp/sctp.h> #include <net/ipv6.h> @@ -126,6 +125,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ + bool try_new_addr; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -144,6 +144,7 @@ struct dlm_node_addr { struct list_head list; int nodeid; int addr_count; + int curr_addr_index; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; @@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, - struct sockaddr *sa_out) + struct sockaddr *sa_out, bool try_new_addr) { struct sockaddr_storage sas; struct dlm_node_addr *na; @@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); - if (na && na->addr_count) - memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); + if (na && na->addr_count) { + if (try_new_addr) { + na->curr_addr_index++; + if (na->curr_addr_index == na->addr_count) + na->curr_addr_index = 0; + } + + memcpy(&sas, na->addr[na->curr_addr_index ], + sizeof(struct sockaddr_storage)); + } spin_unlock(&dlm_node_addrs_spin); if (!na) @@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) { struct dlm_node_addr *na; int rv = -EEXIST; + int addr_i; spin_lock(&dlm_node_addrs_spin); list_for_each_entry(na, &dlm_node_addrs, list) { if (!na->addr_count) continue; - if (!addr_compare(na->addr[0], addr)) - continue; - - *nodeid = na->nodeid; - rv = 0; - break; + for (addr_i = 0; addr_i < na->addr_count; addr_i++) { + if (addr_compare(na->addr[addr_i], addr)) { + *nodeid = na->nodeid; + rv = 0; + goto unlock; + } + } } +unlock: spin_unlock(&dlm_node_addrs_spin); return rv; } @@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd) static void sctp_init_failed_foreach(struct connection *con) { + + /* + * Don't try to recover base con and handle race where the + * other node's assoc init creates a assoc and we get that + * notification, then we get a notification that our attempt + * failed due. This happens when we are still trying the primary + * address, but the other node has already tried secondary addrs + * and found one that worked. + */ + if (!con->nodeid || con->sctp_assoc) + return; + + log_print("Retrying SCTP association init for node %d\n", con->nodeid); + + con->try_new_addr = true; con->sctp_assoc = 0; - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } @@ -579,15 +606,56 @@ static void sctp_init_failed(void) mutex_unlock(&connections_lock); } +static void retry_failed_sctp_send(struct connection *recv_con, + struct sctp_send_failed *sn_send_failed, + char *buf) +{ + int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); + struct dlm_mhandle *mh; + struct connection *con; + char *retry_buf; + int nodeid = sn_send_failed->ssf_info.sinfo_ppid; + + log_print("Retry sending %d bytes to node id %d", len, nodeid); + + con = nodeid2con(nodeid, 0); + if (!con) { + log_print("Could not look up con for nodeid %d\n", + nodeid); + return; + } + + mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); + if (!mh) { + log_print("Could not allocate buf for retry."); + return; + } + memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); + dlm_lowcomms_commit_buffer(mh); + + /* + * If we got a assoc changed event before the send failed event then + * we only need to retry the send. + */ + if (con->sctp_assoc) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } else + sctp_init_failed_foreach(con); +} + /* Something happened to an association */ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; - if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { + switch (sn->sn_header.sn_type) { + case SCTP_SEND_FAILED: + retry_failed_sctp_send(con, &sn->sn_send_failed, buf); + break; + case SCTP_ASSOC_CHANGE: switch (sn->sn_assoc_change.sac_state) { - case SCTP_COMM_UP: case SCTP_RESTART: { @@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con, log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); + new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; + new_con->try_new_addr = false; /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); - clear_bit(CF_INIT_PENDING, &con->flags); + clear_bit(CF_INIT_PENDING, &new_con->flags); if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { queue_work(send_workqueue, &new_con->swork); } @@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con, } break; - /* We don't know which INIT failed, so clear the PENDING flags - * on them all. if assoc_id is zero then it will then try - * again */ - case SCTP_CANT_STR_ASSOC: { + /* Will retry init when we get the send failed notification */ log_print("Can't start SCTP association - retrying"); - sctp_init_failed(); } break; @@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con, (int)sn->sn_assoc_change.sac_assoc_id, sn->sn_assoc_change.sac_state); } + default: + ; /* fall through */ } } @@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e) kfree(e); } +/* + * writequeue_entry_complete - try to delete and free write queue entry + * @e: write queue entry to try to delete + * @completed: bytes completed + * + * writequeue_lock must be held. + */ +static void writequeue_entry_complete(struct writequeue_entry *e, int completed) +{ + e->offset += completed; + e->len -= completed; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + } +} + /* Initiate an SCTP association. This is a special case of send_to_sock() in that we don't yet have a peeled-off socket for this association, so we use the listening socket @@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con) int addrlen; struct kvec iov[1]; + mutex_lock(&con->sock_mutex); if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) - return; - - if (con->retries++ > MAX_CONNECT_RETRIES) - return; + goto unlock; - if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { + if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, + con->try_new_addr)) { log_print("no address for nodeid %d", con->nodeid); - return; + goto unlock; } base_con = nodeid2con(0, 0); BUG_ON(base_con == NULL); @@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con) if (list_empty(&con->writequeue)) { spin_unlock(&con->writequeue_lock); log_print("writequeue empty for nodeid %d", con->nodeid); - return; + goto unlock; } e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; - spin_unlock(&con->writequeue_lock); /* Send the first block off the write queue */ iov[0].iov_base = page_address(e->page)+offset; iov[0].iov_len = len; + spin_unlock(&con->writequeue_lock); + + if (rem_addr.ss_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; + log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; + log_print("Trying to connect to %pI6", &sin6->sin6_addr); + } cmsg = CMSG_FIRSTHDR(&outmessage); cmsg->cmsg_level = IPPROTO_SCTP; @@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con) cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); + sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); outmessage.msg_controllen = cmsg->cmsg_len; + sinfo->sinfo_flags |= SCTP_ADDR_OVER; ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); if (ret < 0) { @@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con) } else { spin_lock(&con->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - } + writequeue_entry_complete(e, ret); spin_unlock(&con->writequeue_lock); } + +unlock: + mutex_unlock(&con->sock_mutex); } /* Connect a new socket to its peer */ @@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out_err; @@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void) int result = -EINVAL, num = 1, i, addr_len; struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; + int one = 1; if (!con) return -ENOMEM; @@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void) goto create_delsock; } + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, + sizeof(one)); + if (result < 0) + log_print("Could not set SCTP NODELAY error %d\n", result); + /* Init con struct */ sock->sk->sk_user_data = con; con->sock = sock; @@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con) } spin_lock(&con->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - } + writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); out: diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f71ec12..cfa109a 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2243,12 +2243,11 @@ out: */ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, size_t *plaintext_name_size, - struct dentry *ecryptfs_dir_dentry, + struct super_block *sb, const char *name, size_t name_size) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private( - ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; char *decoded_name; size_t decoded_name_size; size_t packet_size; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index f622a73..df19d34 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -575,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, struct inode *ecryptfs_inode); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, size_t *decrypted_name_size, - struct dentry *ecryptfs_dentry, + struct super_block *sb, const char *name, size_t name_size); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_encrypt_and_encode_filename( diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 201f0a0..24f1105 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -68,9 +68,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, } struct ecryptfs_getdents_callback { - void *dirent; - struct dentry *dentry; - filldir_t filldir; + struct dir_context ctx; + struct dir_context *caller; + struct super_block *sb; int filldir_called; int entries_written; }; @@ -88,7 +88,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, buf->filldir_called++; rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, - buf->dentry, lower_name, + buf->sb, lower_name, lower_namelen); if (rc) { printk(KERN_ERR "%s: Error attempting to decode and decrypt " @@ -96,9 +96,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, rc); goto out; } - rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + buf->caller->pos = buf->ctx.pos; + rc = !dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); - if (rc >= 0) + if (!rc) buf->entries_written++; out: return rc; @@ -107,27 +108,22 @@ out: /** * ecryptfs_readdir * @file: The eCryptfs directory file - * @dirent: Directory entry handle - * @filldir: The filldir callback function + * @ctx: The actor to feed the entries to */ -static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) { int rc; struct file *lower_file; - struct inode *inode; - struct ecryptfs_getdents_callback buf; - + struct inode *inode = file_inode(file); + struct ecryptfs_getdents_callback buf = { + .ctx.actor = ecryptfs_filldir, + .caller = ctx, + .sb = inode->i_sb, + }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = file->f_pos; - inode = file_inode(file); - memset(&buf, 0, sizeof(buf)); - buf.dirent = dirent; - buf.dentry = file->f_path.dentry; - buf.filldir = filldir; - buf.filldir_called = 0; - buf.entries_written = 0; - rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); - file->f_pos = lower_file->f_pos; + lower_file->f_pos = ctx->pos; + rc = iterate_dir(lower_file, &buf.ctx); + ctx->pos = buf.ctx.pos; if (rc < 0) goto out; if (buf.filldir_called && !buf.entries_written) @@ -295,6 +291,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + int rc; + + rc = filemap_write_and_wait(file->f_mapping); + if (rc) + return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } @@ -338,7 +340,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif const struct file_operations ecryptfs_dir_fops = { - .readdir = ecryptfs_readdir, + .iterate = ecryptfs_readdir, .read = generic_read_dir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT @@ -359,7 +361,7 @@ const struct file_operations ecryptfs_main_fops = { .aio_read = ecryptfs_read_update_atime, .write = do_sync_write, .aio_write = generic_file_aio_write, - .readdir = ecryptfs_readdir, + .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5eab400..67e9b63 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); - BUG_ON(!lower_dentry->d_count); + BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); ecryptfs_set_dentry_lower(dentry, lower_dentry); @@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(old_fs); if (rc < 0) goto out; - rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, + rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, lower_buf, rc); out: kfree(lower_buf); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index bfb5315..8dd524f 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -44,8 +44,11 @@ static ssize_t efivarfs_file_write(struct file *file, bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); - if (!set && bytes) + if (!set && bytes) { + if (bytes == -ENOENT) + bytes = -EIO; goto out; + } if (bytes == -ENOENT) { drop_nlink(inode); @@ -76,7 +79,14 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, int err; err = efivar_entry_size(var, &datasize); - if (err) + + /* + * efivarfs represents uncommitted variables with + * zero-length files. Reading them should return EOF. + */ + if (err == -ENOENT) + return 0; + else if (err) return err; data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 141aee3..a8766b8 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -45,8 +45,8 @@ static struct super_block *efivarfs_sb; * So we need to perform a case-sensitive match on part 1 and a * case-insensitive match on part 2. */ -static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int efivarfs_d_compare(const struct dentry *parent, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { @@ -63,8 +63,7 @@ static int efivarfs_d_compare(const struct dentry *parent, const struct inode *p return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); } -static int efivarfs_d_hash(const struct dentry *dentry, - const struct inode *inode, struct qstr *qstr) +static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) { unsigned long hash = init_name_hash(); const unsigned char *s = qstr->name; @@ -108,7 +107,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) q.name = name; q.len = strlen(name); - err = efivarfs_d_hash(NULL, NULL, &q); + err = efivarfs_d_hash(NULL, &q); if (err) return ERR_PTR(err); diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 055a9e9..b72307c 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -7,40 +7,38 @@ #include <linux/buffer_head.h> #include "efs.h" -static int efs_readdir(struct file *, void *, filldir_t); +static int efs_readdir(struct file *, struct dir_context *); const struct file_operations efs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = efs_readdir, + .iterate = efs_readdir, }; const struct inode_operations efs_dir_inode_operations = { .lookup = efs_lookup, }; -static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = file_inode(filp); - struct buffer_head *bh; - - struct efs_dir *dirblock; - struct efs_dentry *dirslot; - efs_ino_t inodenum; +static int efs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); efs_block_t block; - int slot, namelen; - char *nameptr; + int slot; if (inode->i_size & (EFS_DIRBSIZE-1)) printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); /* work out where this entry can be found */ - block = filp->f_pos >> EFS_DIRBSIZE_BITS; + block = ctx->pos >> EFS_DIRBSIZE_BITS; /* each block contains at most 256 slots */ - slot = filp->f_pos & 0xff; + slot = ctx->pos & 0xff; /* look at all blocks */ while (block < inode->i_blocks) { + struct efs_dir *dirblock; + struct buffer_head *bh; + /* read the dir block */ bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); @@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { break; } - while (slot < dirblock->slots) { - if (dirblock->space[slot] == 0) { - slot++; + for (; slot < dirblock->slots; slot++) { + struct efs_dentry *dirslot; + efs_ino_t inodenum; + const char *nameptr; + int namelen; + + if (dirblock->space[slot] == 0) continue; - } dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); @@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { #ifdef DEBUG printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); #endif - if (namelen > 0) { - /* found the next entry */ - filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; - - /* copy filename and data in dirslot */ - filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN); - - /* sanity check */ - if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { - printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); - slot++; - continue; - } - - /* store position of next slot */ - if (++slot == dirblock->slots) { - slot = 0; - block++; - } + if (!namelen) + continue; + /* found the next entry */ + ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; + + /* sanity check */ + if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { + printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + continue; + } + + /* copy filename and data in dirslot */ + if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) { brelse(bh); - filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; - goto out; + return 0; } - slot++; } brelse(bh); slot = 0; block++; } - - filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; -out: + ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot; return 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index deecc72..9ad17b15 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,6 +34,7 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> @@ -1602,7 +1603,8 @@ fetch_events: } spin_unlock_irqrestore(&ep->lock, flags); - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!freezable_schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); @@ -1975,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return -EINVAL; if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } error = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -1993,7 +1995,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return error; @@ -2020,8 +2022,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) return -EFAULT; sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } err = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -2038,7 +2040,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return err; @@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, - .intent = LOOKUP_OPEN + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -756,10 +757,11 @@ struct file *open_exec(const char *name) static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, - .intent = LOOKUP_OPEN + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW); + file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -930,6 +932,7 @@ static int de_thread(struct task_struct *tsk) * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; + tsk->real_start_time = leader->real_start_time; BUG_ON(!same_thread_group(leader, tsk)); BUG_ON(has_group_leader_pid(tsk)); @@ -945,9 +948,8 @@ static int de_thread(struct task_struct *tsk) * Note: The old leader also uses this pid until release_task * is called. Odd but simple and correct. */ - detach_pid(tsk, PIDTYPE_PID); tsk->pid = leader->pid; - attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); + change_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); @@ -1135,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - /* - * Flush performance counters when crossing a - * security domain: - */ - if (!get_dumpable(current->mm)) - perf_event_exit_task(current); - /* An exec changes our domain. We are no longer part of the thread group */ @@ -1205,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(current->mm) != SUID_DUMP_USER) + perf_event_exit_task(current); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's @@ -1461,7 +1465,6 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; bool clear_in_exec; int retval; - const struct cred *cred = current_cred(); /* * We move the actual failure in case of RLIMIT_NPROC excess from @@ -1470,7 +1473,7 @@ static int do_execve_common(const char *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { + atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4637589..49f51ab 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) } static int -exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +exofs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); + int need_revalidate = (file->f_version != inode->i_version); if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) return 0; - types = exofs_filetype_table; - for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct exofs_dir_entry *de; @@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); @@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (offset) { offset = exofs_validate_entry(kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (struct exofs_dir_entry *)(kaddr + offset); @@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) return -EIO; } if (de->inode_no) { - int over; - unsigned char d_type = DT_UNKNOWN; + unsigned char t; - if (types && de->file_type < EXOFS_FT_MAX) - d_type = types[de->file_type]; + if (de->file_type < EXOFS_FT_MAX) + t = exofs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, + if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode_no), - d_type); - if (over) { + t)) { exofs_put_page(page); return 0; } } - filp->f_pos += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } exofs_put_page(page); } - return 0; } @@ -669,5 +663,5 @@ not_empty: const struct file_operations exofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = exofs_readdir, + .iterate = exofs_readdir, }; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d1f80ab..2ec8eb1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) return 0; } -static void exofs_invalidatepage(struct page *page, unsigned long offset) +static void exofs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", + page->index, offset, length); WARN_ON(1); } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 262fc99..293bc2e 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf) } struct getdents_callback { + struct dir_context ctx; char *name; /* name that was found. It already points to a buffer NAME_MAX+1 is size */ unsigned long ino; /* the inum we are looking for */ @@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child) struct inode *dir = path->dentry->d_inode; int error; struct file *file; - struct getdents_callback buffer; + struct getdents_callback buffer = { + .ctx.actor = filldir_one, + .name = name, + .ino = child->d_inode->i_ino + }; error = -ENOTDIR; if (!dir || !S_ISDIR(dir->i_mode)) @@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->readdir) + if (!file->f_op->iterate) goto out_close; - buffer.name = name; - buffer.ino = child->d_inode->i_ino; - buffer.found = 0; buffer.sequence = 0; while (1) { int old_seq = buffer.sequence; - error = vfs_readdir(file, filldir_one, &buffer); + error = iterate_dir(file, &buffer.ctx); if (buffer.found) { error = 0; break; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 4237722bf..6e1d4ab 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) } static int -ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) +ext2_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - int need_revalidate = filp->f_version != inode->i_version; + int need_revalidate = file->f_version != inode->i_version; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) return 0; @@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) ext2_error(sb, __func__, "bad page in #%lu", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ext2_validate_entry(kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (ext2_dirent *)(kaddr+offset); @@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) return -EIO; } if (de->inode) { - int over; unsigned char d_type = DT_UNKNOWN; if (types && de->file_type < EXT2_FT_MAX) d_type = types[de->file_type]; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, - le32_to_cpu(de->inode), d_type); - if (over) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + d_type)) { ext2_put_page(page); return 0; } } - filp->f_pos += ext2_rec_len_from_disk(de->rec_len); + ctx->pos += ext2_rec_len_from_disk(de->rec_len); } ext2_put_page(page); } @@ -724,7 +721,7 @@ not_empty: const struct file_operations ext2_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ext2_readdir, + .iterate = ext2_readdir, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 73b0d95..256dd5f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return ext2_add_nondir(dentry, inode); } +static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode = ext2_new_inode(dir, mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; +} + static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; @@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .tmpfile = ext2_tmpfile, }; const struct inode_operations ext2_special_inode_operations = { diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 87eccbb..f522425 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); +static int ext3_dx_readdir(struct file *, struct dir_context *); static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, return error_msg == NULL ? 1 : 0; } -static int ext3_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned long offset; - int i, stored; + int i; struct ext3_dir_entry_2 *de; int err; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - int ret = 0; int dir_has_error = 0; if (is_dx_dir(inode)) { - err = ext3_dx_readdir(filp, dirent, filldir); - if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; - } + err = ext3_dx_readdir(file, ctx); + if (err != ERR_BAD_DX_DIR) + return err; /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL; + EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { - unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb); + while (ctx->pos < inode->i_size) { + unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb); struct buffer_head map_bh; struct buffer_head *bh = NULL; @@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp, if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext3_bread(NULL, inode, blk, 0, &err); } @@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp, if (!dir_has_error) { ext3_error(sb, __func__, "directory #%lu " "contains a hole at offset %lld", - inode->i_ino, filp->f_pos); + inode->i_ino, ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (offset && file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext3_dir_entry_2 *) (bh->b_data + i); @@ -177,53 +169,40 @@ revalidate: i += ext3_rec_len_from_disk(de->rec_len); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); if (!ext3_check_dir_entry ("ext3_readdir", inode, de, bh, offset)) { - /* On error, skip the f_pos to the + /* On error, skip the to the next block. */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); - ret = stored; - goto out; + break; } offset += ext3_rec_len_from_disk(de->rec_len); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored ++; + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext3_rec_len_from_disk(de->rec_len); + ctx->pos += ext3_rec_len_from_disk(de->rec_len); } offset = 0; brelse (bh); + if (ctx->pos < inode->i_size) + if (!dir_relax(inode)) + return 0; } -out: - return ret; + return 0; } static inline int is_32bit_api(void) @@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file * filp, void * dirent, - filldir_t filldir, struct fname *fname) +static bool call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = file_inode(filp); - struct super_block * sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { printk("call_filldir: called with null fname?!?\n"); - return 0; + return true; } - curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; + get_dtype(sb, fname->file_type))) { info->extra_fname = fname; - return error; + return false; } fname = fname->next; } - return 0; + return true; } -static int ext3_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext3_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = file_inode(filp); + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = ext3_htree_create_dir_info(filp, filp->f_pos); + info = ext3_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == ext3_get_htree_eof(filp)) + if (ctx->pos == ext3_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp, filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* @@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp, * chain, return them first. */ if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) + if (!call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; @@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext3_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext3_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = ext3_get_htree_eof(filp); + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (!call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); @@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = ext3_get_htree_eof(filp); + ctx->pos = ext3_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) const struct file_operations ext3_dir_operations = { .llseek = ext3_dir_llseek, .read = generic_read_dir, - .readdir = ext3_readdir, + .iterate = ext3_readdir, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index b31dbd4..1cb9c7e 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext3_sync_file_enter(file, datasync); - if (inode->i_sb->s_flags & MS_RDONLY) + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated state */ + smp_rmb(); + if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS) + return -EROFS; return 0; - + } ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) goto out; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 23c7128..2bd8548 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); } -static void ext3_invalidatepage(struct page *page, unsigned long offset) +static void ext3_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); - trace_ext3_invalidatepage(page, offset); + trace_ext3_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - journal_invalidatepage(journal, page, offset); + journal_invalidatepage(journal, page, offset, length); } static int ext3_releasepage(struct page *page, gfp_t wait) @@ -1984,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 692de13..998ea11 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); - return count; + /* silently ignore the rest of the block */ + break; } ext3fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -1762,6 +1759,45 @@ retry: return err; } +static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT3_XATTR_TRANS_BLOCKS); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + inode = ext3_new_inode (handle, dir, NULL, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + ext3_set_aops(inode); + err = ext3_orphan_add(handle, inode); + if (err) + goto err_drop_inode; + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + } + ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + ext3_journal_stop(handle); + unlock_new_inode(inode); + iput(inode); + return err; +} + static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) { handle_t *handle; @@ -2303,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS); + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2317,6 +2353,11 @@ retry: err = ext3_add_entry(handle, dentry, inode); if (!err) { ext3_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext3_orphan_del(handle, inode); d_instantiate(dentry, inode); } else { drop_nlink(inode); @@ -2519,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = { .mkdir = ext3_mkdir, .rmdir = ext3_rmdir, .mknod = ext3_mknod, + .tmpfile = ext3_tmpfile, .rename = ext3_rename, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6356665..c47f147 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb) if (test_opt (sb, ERRORS_RO)) { ext3_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } ext3_commit_super(sb, es, 1); @@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function, ext3_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; - sb->s_flags |= MS_RDONLY; set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); + /* + * Make sure updated value of ->s_mount_state will be visible + * before ->s_flags update. + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d0f13ea..5833939 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) static inline int test_root(ext4_group_t a, int b) { - int num = b; - - while (a > num) - num *= b; - return num == a; + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } } static int ext4_group_sparse(ext4_group_t group) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f8d56e4..3c7d288 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -29,8 +29,7 @@ #include "ext4.h" #include "xattr.h" -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir); +static int ext4_dx_readdir(struct file *, struct dir_context *); /** * Check if the given dir-inode refers to an htree-indexed directory @@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 1; } -static int ext4_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int ext4_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; int err; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - int ret = 0; int dir_has_error = 0; if (is_dx_dir(inode)) { - err = ext4_dx_readdir(filp, dirent, filldir); + err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; + return err; } /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. */ - ext4_clear_inode_flag(file_inode(filp), + ext4_clear_inode_flag(file_inode(file), EXT4_INODE_INDEX); } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; - ret = ext4_read_inline_dir(filp, dirent, filldir, + int ret = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return ret; } stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { + while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; struct buffer_head *bh = NULL; - map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); } @@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - EXT4_ERROR_FILE(filp, 0, + EXT4_ERROR_FILE(file, 0, "directory contains a " "hole at offset %llu", - (unsigned long long) filp->f_pos); + (unsigned long long) ctx->pos); dir_has_error = 1; } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } @@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp, if (!buffer_verified(bh) && !ext4_dirent_csum_verify(inode, (struct ext4_dir_entry *)bh->b_data)) { - EXT4_ERROR_FILE(filp, 0, "directory fails checksum " + EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", - (unsigned long long)filp->f_pos); - filp->f_pos += sb->s_blocksize - offset; + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; brelse(bh); continue; } set_buffer_verified(bh); -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -214,57 +208,46 @@ revalidate: sb->s_blocksize); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, bh, + if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* - * On error, skip the f_pos to the next block + * On error, skip to the next block */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse(bh); - ret = stored; - goto out; + break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, + if (!dir_emit(ctx, de->name, de->name_len, - filp->f_pos, le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = 0; brelse(bh); + if (ctx->pos < inode->i_size) { + if (!dir_relax(inode)) + return 0; + } } -out: - return ret; + return 0; } static inline int is_32bit_api(void) @@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file *filp, void *dirent, - filldir_t filldir, struct fname *fname) +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = file_inode(filp); - struct super_block *sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " @@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent, inode->i_ino, current->comm); return 0; } - curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, + fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; + get_dtype(sb, fname->file_type))) { info->extra_fname = fname; - return error; + return 1; } fname = fname->next; } return 0; } -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = file_inode(filp); + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = ext4_htree_create_dir_info(filp, filp->f_pos); + info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == ext4_get_htree_eof(filp)) + if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp, filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* @@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp, * chain, return them first. */ if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) + if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; @@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp, * cached entries. */ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext4_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = ext4_get_htree_eof(filp); + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); @@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = ext4_get_htree_eof(filp); + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } @@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, - .readdir = ext4_readdir, + .iterate = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0aabb34..b577e45 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,33 +177,22 @@ struct ext4_map_blocks { }; /* - * For delayed allocation tracking - */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - -/* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_DIRECT 0x0004 +#define EXT4_IO_END_DIRECT 0x0002 /* - * For converting uninitialized extents on a work queue. + * For converting uninitialized extents on a work queue. 'handle' is used for + * buffered writeback. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ unsigned int flag; /* unwritten or not */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ @@ -582,11 +571,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 /* - * Flags used by ext4_discard_partial_page_buffers - */ -#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 - -/* * ioctl commands */ #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS @@ -880,6 +864,7 @@ struct ext4_inode_info { rwlock_t i_es_lock; struct list_head i_es_lru; unsigned int i_es_lru_nr; /* protected by i_es_lock */ + unsigned long i_touch_when; /* jiffies of last accessing */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -904,12 +889,22 @@ struct ext4_inode_info { qsize_t i_reserved_quota; #endif - /* completed IOs that might need unwritten extents handling */ - struct list_head i_completed_io_list; + /* Lock protecting lists below */ spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + /* + * Completed IOs that need unwritten extents handling and don't have + * transaction reserved + */ + struct list_head i_unrsv_conversion_list; atomic_t i_ioend_count; /* Number of outstanding io_end structs */ atomic_t i_unwritten; /* Nr. of inflight conversions pending */ - struct work_struct i_unwritten_work; /* deferred extent conversion */ + struct work_struct i_rsv_conversion_work; + struct work_struct i_unrsv_conversion_work; spinlock_t i_block_reservation_lock; @@ -1246,7 +1241,6 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_max_writeback_mb_bump; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; @@ -1282,8 +1276,10 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; ext4_group_t s_flex_groups_allocated; - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; + /* workqueue for unreserved extent convertions (dio) */ + struct workqueue_struct *unrsv_conversion_wq; + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; @@ -1308,6 +1304,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; + unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; @@ -1343,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + /* Writeback has to have coversion transaction reserved */ + WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle && + !(io_end->flag & EXT4_IO_END_DIRECT)); io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } @@ -2000,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_unwritten_io(struct inode *); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct @@ -2089,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); @@ -2097,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags); +extern int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from); +extern int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, @@ -2112,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); -extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t first, ext4_lblk_t stop); @@ -2167,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); + extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ - __LINE__, ## message) extern __printf(5, 6) -void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern __printf(5, 6) -void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_abort(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ - __LINE__, ## message) extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ - __LINE__, ## message) extern __printf(3, 4) -void ext4_msg(struct super_block *, const char *, const char *, ...); +void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ - __LINE__, msg) extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, unsigned long, ext4_fsblk_t, const char *, ...); -#define ext4_grp_locked_error(sb, grp, message...) \ - __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -2313,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, { struct ext4_group_info ***grp_info; long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); grp_info = EXT4_SB(sb)->s_group_info; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); @@ -2516,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, + struct dir_context *ctx, int *has_inline_data); extern int htree_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, @@ -2599,8 +2656,7 @@ struct ext4_extent; extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); -extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, - int chunk); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern void ext4_ext_truncate(handle_t *, struct inode *); @@ -2610,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -2652,14 +2708,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); -extern void ext4_ioend_shutdown(struct inode *); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); extern int ext4_put_io_end(ext4_io_end_t *io_end); extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); -extern void ext4_end_io_work(struct work_struct *work); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_end_io_unrsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, @@ -2672,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); extern int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp); -/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +/* + * Note that these flags will never ever appear in a buffer_head's state flag. + * See EXT4_MAP_... to see where this is used. + */ enum ext4_state_bits { BH_Uninit /* blocks are allocated but uninitialized on disk */ - = BH_JBDPrivateStart, + = BH_JBDPrivateStart, BH_AllocFromCluster, /* allocated blocks were part of already - * allocated cluster. Note that this flag will - * never, ever appear in a buffer_head's state - * flag. See EXT4_MAP_FROM_CLUSTER to see where - * this is used. */ + * allocated cluster. */ }; -BUFFER_FNS(Uninit, uninit) -TAS_BUFFER_FNS(Uninit, uninit) - /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 451eb40..72a3600 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle) /* * Wrappers for jbd2_journal_start/end. */ -handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int nblocks) +static int ext4_journal_check_start(struct super_block *sb) { journal_t *journal; might_sleep(); - - trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - + return -EROFS; WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; - if (!journal) - return ext4_get_nojournal(); /* * Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ - if (is_journal_aborted(journal)) { + if (journal && is_journal_aborted(journal)) { ext4_abort(sb, "Detected aborted journal"); - return ERR_PTR(-EROFS); + return -EROFS; } - return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line); + return 0; +} + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks) +{ + journal_t *journal; + int err; + + trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) + return ERR_PTR(err); + + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, + type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) return err; } +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type) +{ + struct super_block *sb; + int err; + + if (!ext4_handle_valid(handle)) + return ext4_get_nojournal(); + + sb = handle->h_journal->j_private; + trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, + _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) { + jbd2_journal_free_reserved(handle); + return ERR_PTR(err); + } + + err = jbd2_journal_start_reserved(handle, type, line); + if (err < 0) + return ERR_PTR(err); + return handle; +} + void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index c8c6885..2877258 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode) #define EXT4_HT_MIGRATE 8 #define EXT4_HT_MOVE_EXTENTS 9 #define EXT4_HT_XATTR 10 -#define EXT4_HT_MAX 11 +#define EXT4_HT_EXT_CONVERT 11 +#define EXT4_HT_MAX 12 /** * struct ext4_journal_cb_entry - Base structure for callback information. @@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int nblocks); + int type, int blocks, int rsv_blocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks)) + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) #define ext4_journal_start(inode, type, nblocks) \ - __ext4_journal_start((inode), __LINE__, (type), (nblocks)) + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, - int nblocks) + int blocks, int rsv_blocks) { - return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks); + return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + rsv_blocks); } #define ext4_journal_stop(handle) \ __ext4_journal_stop(__func__, __LINE__, (handle)) +#define ext4_journal_start_reserved(handle, type) \ + __ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type); + +static inline void ext4_journal_free_reserved(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_free_reserved(handle); +} + static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 107936d..7097b0f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, next_del = ext4_find_delayed_extent(inode, &es); if (!exists && next_del) { exists = 1; - flags |= FIEMAP_EXTENT_DELALLOC; + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); } up_read(&EXT4_I(inode)->i_data_sem); @@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, } /* - * How many index/leaf blocks need to change/allocate to modify nrblocks? + * How many index/leaf blocks need to change/allocate to add @extents extents? * - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too + * If we add a single extent, then in the worse case, each tree level + * index/leaf need to be changed in case of the tree split. * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. + * If more extents are inserted, they could cause the whole tree split more + * than once, but this is really rare. */ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; @@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) depth = ext_depth(inode); - if (chunk) + if (extents <= 1) index = depth * 2; else index = depth * 3; @@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) return index; } +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; +} + static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, - ext4_fsblk_t *partial_cluster, + long long *partial_cluster, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; - int flags = 0; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; - else if (ext4_should_journal_data(inode)) - flags |= EXT4_FREE_BLOCKS_FORGET; + int flags = get_default_free_blocks_flags(inode); /* * For bigalloc file systems, we never free a partial cluster @@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if ((*partial_cluster > 0) && + (EXT4_B2C(sbi, pblk) != *partial_cluster)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; + unsigned int unaligned; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, pblk); + /* + * Usually we want to free partial cluster at the end of the + * extent, except for the situation when the cluster is still + * used by any other extent (partial_cluster is negative). + */ + if (*partial_cluster < 0 && + -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + ext_debug("free last %u blocks starting %llu partial %lld\n", + num, pblk, *partial_cluster); ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* * If the block range to be freed didn't start at the * beginning of a cluster, and we removed the entire - * extent, save the partial cluster here, since we - * might need to delete if we determine that the - * truncate operation has removed all of the blocks in - * the cluster. + * extent and the cluster is not used by any other extent, + * save the partial cluster here, since we might need to + * delete if we determine that the truncate operation has + * removed all of the blocks in the cluster. + * + * On the other hand, if we did not manage to free the whole + * extent, we have to mark the cluster as used (store negative + * cluster number in partial_cluster). */ - if (pblk & (sbi->s_cluster_ratio - 1) && - (ee_len == num)) + unaligned = pblk & (sbi->s_cluster_ratio - 1); + if (unaligned && (ee_len == num) && + (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); - else + else if (unaligned) + *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); + else if (*partial_cluster > 0) *partial_cluster = 0; - } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); - - } else { - printk(KERN_INFO "strange request: removal(2) " - "%u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } + } else + ext4_error(sbi->s_sb, "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } @@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * has been released from it. It gets negative in case + * that the cluster is still used. * @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + struct ext4_ext_path *path, + long long *partial_cluster, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned short ex_ee_len; unsigned uninitialized = 0; struct ext4_extent *ex; + ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug("truncate since %u in leaf to %u\n", start, end); @@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, return -EIO; } /* find where to start removing */ - ex = EXT_LAST_EXTENT(eh); + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so if this extent is not cluster aligned we have + * to mark the current cluster as used to avoid + * accidentally freeing it later on + */ + pblk = ext4_ext_pblock(ex); + if (pblk & (sbi->s_cluster_ratio - 1)) + *partial_cluster = + -((long long)EXT4_B2C(sbi, pblk)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else + } else if (*partial_cluster > 0) *partial_cluster = 0; err = ext4_ext_dirty(handle, inode, path + depth); @@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * If there is still a entry in the leaf node, check to see if - * it references the partial cluster. This is the only place - * where it could; if it doesn't, we can free the cluster. + * Free the partial cluster only if the current extent does not + * reference it. Otherwise we might free used cluster. */ - if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + if (*partial_cluster > 0 && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + int flags = get_default_free_blocks_flags(inode); ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), @@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; - ext4_fsblk_t partial_cluster = 0; + long long partial_cluster = 0; handle_t *handle; int i = 0, err = 0; @@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, return PTR_ERR(handle); again: - trace_ext4_ext_remove_space(inode, start, depth); + trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that @@ -2844,17 +2866,14 @@ again: } } - trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, - path->p_hdr->eh_entries); + trace_ext4_ext_remove_space_done(inode, start, end, depth, + partial_cluster, path->p_hdr->eh_entries); /* If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial * cluster as well. */ - if (partial_cluster && path->p_hdr->eh_entries == 0) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { + int flags = get_default_free_blocks_flags(inode); ext4_free_blocks(handle, inode, NULL, EXT4_C2B(EXT4_SB(sb), partial_cluster), @@ -3642,7 +3661,7 @@ int ext4_find_delalloc_range(struct inode *inode, { struct extent_status es; - ext4_es_find_delayed_extent(inode, lblk_start, &es); + ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ else if (es.es_lblk <= lblk_start && @@ -4363,7 +4382,7 @@ out2: } out3: - trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); + trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } @@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); + return ext4_punch_hole(inode, offset, len); ret = ext4_convert_inline_data(inode); if (ret) @@ -4548,10 +4567,9 @@ retry: * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len) +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) { - handle_t *handle; unsigned int max_blocks; int ret = 0; int ret2 = 0; @@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - map.m_lblk); /* - * credits to insert 1 extent into extent tree + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + credits = 0; + } else { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); @@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret <= 0 || ret2 ) + if (credits) + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2) break; } + if (!credits) + ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } @@ -4608,9 +4645,10 @@ static int ext4_find_delayed_extent(struct inode *inode, struct extent_status es; ext4_lblk_t block, next_del; - ext4_es_find_delayed_extent(inode, newes->es_lblk, &es); - if (newes->es_pblk == 0) { + ext4_es_find_delayed_extent_range(inode, newes->es_lblk, + newes->es_lblk + newes->es_len - 1, &es); + /* * No extent in extent-tree contains block @newes->es_pblk, * then the block may stay in 1)a hole or 2)delayed-extent. @@ -4630,7 +4668,7 @@ static int ext4_find_delayed_extent(struct inode *inode, } block = newes->es_lblk + newes->es_len; - ext4_es_find_delayed_extent(inode, block, &es); + ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); if (es.es_len == 0) next_del = EXT_MAX_BLOCKS; else @@ -4658,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode, error = ext4_get_inode_loc(inode, &iloc); if (error) return error; - physical = iloc.bh->b_blocknr << blockbits; + physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; @@ -4666,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode, flags |= FIEMAP_EXTENT_DATA_INLINE; brelse(iloc.bh); } else { /* external block */ - physical = EXT4_I(inode)->i_file_acl << blockbits; + physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index fe3337a..ee018d5 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -10,6 +10,7 @@ * Ext4 extents status tree core functions. */ #include <linux/rbtree.h> +#include <linux/list_sort.h> #include "ext4.h" #include "extents_status.h" #include "ext4_extents.h" @@ -232,14 +233,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk - * if it exists, otherwise, the next extent after @es->lblk. + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents * @lblk: the offset where we start to search + * @end: the offset where we stop to search * @es: delayed extent that we found */ -void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { struct ext4_es_tree *tree = NULL; @@ -247,7 +250,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct rb_node *node; BUG_ON(es == NULL); - trace_ext4_es_find_delayed_extent_enter(inode, lblk); + BUG_ON(end < lblk); + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -270,6 +274,10 @@ out: if (es1 && !ext4_es_is_delayed(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } if (ext4_es_is_delayed(es1)) break; } @@ -284,8 +292,7 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); - trace_ext4_es_find_delayed_extent_exit(inode, es); + trace_ext4_es_find_delayed_extent_range_exit(inode, es); } static struct extent_status * @@ -665,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, error: write_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); ext4_es_print_tree(inode); return err; @@ -727,7 +733,6 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); - ext4_es_lru_add(inode); trace_ext4_es_lookup_extent_exit(inode, es, found); return found; } @@ -871,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) EXTENT_STATUS_WRITTEN); } +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct ext4_inode_info *eia, *eib; + eia = list_entry(a, struct ext4_inode_info, i_es_lru); + eib = list_entry(b, struct ext4_inode_info, i_es_lru); + + if (eia->i_touch_when == eib->i_touch_when) + return 0; + if (time_after(eia->i_touch_when, eib->i_touch_when)) + return 1; + else + return -1; +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); struct ext4_inode_info *ei; - struct list_head *cur, *tmp, scanned; + struct list_head *cur, *tmp; + LIST_HEAD(skiped); int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk = 0; @@ -886,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) if (!nr_to_scan) return ret; - INIT_LIST_HEAD(&scanned); - spin_lock(&sbi->s_es_lru_lock); + + /* + * If the inode that is at the head of LRU list is newer than + * last_sorted time, that means that we need to sort this list. + */ + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); + if (sbi->s_es_last_sorted < ei->i_touch_when) { + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + } + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - list_move_tail(cur, &scanned); + /* + * If we have already reclaimed all extents from extent + * status tree, just stop the loop immediately. + */ + if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); - read_lock(&ei->i_es_lock); - if (ei->i_es_lru_nr == 0) { - read_unlock(&ei->i_es_lock); + /* Skip the inode that is newer than the last_sorted time */ + if (sbi->s_es_last_sorted < ei->i_touch_when) { + list_move_tail(cur, &skiped); continue; } - read_unlock(&ei->i_es_lock); + + if (ei->i_es_lru_nr == 0) + continue; write_lock(&ei->i_es_lock); ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + if (ei->i_es_lru_nr == 0) + list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); nr_shrunk += ret; @@ -910,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) if (nr_to_scan == 0) break; } - list_splice_tail(&scanned, &sbi->s_es_lru); + + /* Move the newer inodes into the tail of the LRU list. */ + list_splice_tail(&skiped, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); @@ -918,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) return ret; } -void ext4_es_register_shrinker(struct super_block *sb) +void ext4_es_register_shrinker(struct ext4_sb_info *sbi) { - struct ext4_sb_info *sbi; - - sbi = EXT4_SB(sb); INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_last_sorted = 0; sbi->s_es_shrinker.shrink = ext4_es_shrink; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sbi->s_es_shrinker); } -void ext4_es_unregister_shrinker(struct super_block *sb) +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { - unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); + unregister_shrinker(&sbi->s_es_shrinker); } void ext4_es_lru_add(struct inode *inode) @@ -940,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ei->i_touch_when = jiffies; + + if (!list_empty(&ei->i_es_lru)) + return; + spin_lock(&sbi->s_es_lru_lock); if (list_empty(&ei->i_es_lru)) list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - else - list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); spin_unlock(&sbi->s_es_lru_lock); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index d8e2d4d..e936730 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,7 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_sb_info; struct ext4_extent; struct extent_status { @@ -62,7 +63,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, unsigned long long status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); @@ -118,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es, es->es_pblk = block; } -extern void ext4_es_register_shrinker(struct super_block *sb); -extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4959e29..6f4cc56 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { if (last != start) - dataoff = last << blkbits; + dataoff = (loff_t)last << blkbits; break; } @@ -465,10 +465,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - ext4_es_find_delayed_extent(inode, last, &es); + ext4_es_find_delayed_extent_range(inode, last, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) - dataoff = last << blkbits; + dataoff = (loff_t)last << blkbits; break; } @@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) } last++; - dataoff = last << blkbits; + dataoff = (loff_t)last << blkbits; } while (last <= end); mutex_unlock(&inode->i_mutex); @@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) if (dataoff > isize) return -ENXIO; - if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (dataoff > maxsize) - return -EINVAL; - - if (dataoff != file->f_pos) { - file->f_pos = dataoff; - file->f_version = 0; - } - - return dataoff; + return vfs_setpos(file, dataoff, maxsize); } /* @@ -540,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { last += ret; - holeoff = last << blkbits; + holeoff = (loff_t)last << blkbits; continue; } @@ -548,10 +538,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - ext4_es_find_delayed_extent(inode, last, &es); + ext4_es_find_delayed_extent_range(inode, last, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { last = es.es_lblk + es.es_len; - holeoff = last << blkbits; + holeoff = (loff_t)last << blkbits; continue; } @@ -566,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) &map, &holeoff); if (!unwritten) { last += ret; - holeoff = last << blkbits; + holeoff = (loff_t)last << blkbits; continue; } } @@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) if (holeoff > isize) holeoff = isize; - if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (holeoff > maxsize) - return -EINVAL; - - if (holeoff != file->f_pos) { - file->f_pos = holeoff; - file->f_version = 0; - } - - return holeoff; + return vfs_setpos(file, holeoff, maxsize); } /* diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e0ba8a4..a8bc47f 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode) return ret; } -/** - * __sync_file - generic_file_fsync without the locking and filemap_write - * @inode: inode to sync - * @datasync: only sync essential metadata if true - * - * This is just generic_file_fsync without the locking. This is needed for - * nojournal mode to make sure this inodes data/metadata makes it to disk - * properly. The i_mutex should be held already. - */ -static int __sync_inode(struct inode *inode, int datasync) -{ - int err; - int ret; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = sync_inode_metadata(inode, 1); - if (ret == 0) - ret = err; - return ret; -} - /* * akpm: A new design for ext4_sync_file(). * @@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret, err; + int ret = 0, err; tid_t commit_tid; bool needs_barrier = false; @@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - if (inode->i_sb->s_flags & MS_RDONLY) - goto out; - - ret = ext4_flush_unwritten_io(inode); - if (ret < 0) + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ret = -EROFS; goto out; + } if (!journal) { - ret = __sync_inode(inode, datasync); + ret = generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; } + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!ret) ret = err; } - out: - mutex_unlock(&inode->i_mutex); +out: trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 00a818d..f03598c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -747,7 +747,8 @@ repeat_in_this_group: if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks); + handle_type, nblocks, + 0); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b8d5d35..87b30cd 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -624,7 +624,7 @@ cleanup: partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map, err); + trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } @@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, retry: if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { - mutex_lock(&inode->i_mutex); - ext4_flush_unwritten_io(inode); - mutex_unlock(&inode->i_mutex); - } /* * Nolock dioread optimization may be dynamically disabled * via ext4_inode_block_unlocked_dio(). Check inode's state @@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } -int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block */ - indirects = nrblocks * 2 + 1; - return indirects; + return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } /* @@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *last) { __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; + flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, count)) { diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3e2bf87..d9ecbf1 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, entry = (struct ext4_xattr_entry *) ((void *)raw_inode + EXT4_I(inode)->i_inline_off); - free += le32_to_cpu(entry->e_value_size); + free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); goto out; } @@ -1404,16 +1404,15 @@ out: * offset as if '.' and '..' really take place. * */ -int ext4_read_inline_dir(struct file *filp, - void *dirent, filldir_t filldir, +int ext4_read_inline_dir(struct file *file, + struct dir_context *ctx, int *has_inline_data) { - int error = 0; unsigned int offset, parent_ino; - int i, stored; + int i; struct ext4_dir_entry_2 *de; struct super_block *sb; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); int ret, inline_size = 0; struct ext4_iloc iloc; void *dir_buf = NULL; @@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp, goto out; sb = inode->i_sb; - stored = 0; parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); - offset = filp->f_pos; + offset = ctx->pos; /* * dotdot_offset and dotdot_size is the real offset and @@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp, extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; - while (!error && !stored && filp->f_pos < extra_size) { -revalidate: - /* - * If the version has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the inline - * dir to make sure. - */ - if (filp->f_version != inode->i_version) { - for (i = 0; i < extra_size && i < offset;) { - /* - * "." is with offset 0 and - * ".." is dotdot_offset. - */ - if (!i) { - i = dotdot_offset; - continue; - } else if (i == dotdot_offset) { - i = dotdot_size; - continue; - } - /* for other entry, the real offset in - * the buf has to be tuned accordingly. - */ - de = (struct ext4_dir_entry_2 *) - (dir_buf + i - extra_offset); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - extra_size) < EXT4_DIR_REC_LEN(1)) - break; - i += ext4_rec_len_from_disk(de->rec_len, - extra_size); - } - offset = i; - filp->f_pos = offset; - filp->f_version = inode->i_version; - } - - while (!error && filp->f_pos < extra_size) { - if (filp->f_pos == 0) { - error = filldir(dirent, ".", 1, 0, inode->i_ino, - DT_DIR); - if (error) - break; - stored++; - filp->f_pos = dotdot_offset; + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (file->f_version != inode->i_version) { + for (i = 0; i < extra_size && i < offset;) { + /* + * "." is with offset 0 and + * ".." is dotdot_offset. + */ + if (!i) { + i = dotdot_offset; + continue; + } else if (i == dotdot_offset) { + i = dotdot_size; continue; } + /* for other entry, the real offset in + * the buf has to be tuned accordingly. + */ + de = (struct ext4_dir_entry_2 *) + (dir_buf + i - extra_offset); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, extra_size) + < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + extra_size); + } + offset = i; + ctx->pos = offset; + file->f_version = inode->i_version; + } - if (filp->f_pos == dotdot_offset) { - error = filldir(dirent, "..", 2, - dotdot_offset, - parent_ino, DT_DIR); - if (error) - break; - stored++; + while (ctx->pos < extra_size) { + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_offset; + continue; + } - filp->f_pos = dotdot_size; - continue; - } + if (ctx->pos == dotdot_offset) { + if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR)) + goto out; + ctx->pos = dotdot_size; + continue; + } - de = (struct ext4_dir_entry_2 *) - (dir_buf + filp->f_pos - extra_offset); - if (ext4_check_dir_entry(inode, filp, de, - iloc.bh, dir_buf, - extra_size, filp->f_pos)) { - ret = stored; + de = (struct ext4_dir_entry_2 *) + (dir_buf + ctx->pos - extra_offset); + if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf, + extra_size, ctx->pos)) + goto out; + if (le32_to_cpu(de->inode)) { + if (!dir_emit(ctx, de->name, de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) goto out; - } - if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; - } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - extra_size); } + ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size); } out: kfree(dir_buf); @@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode, if (error) goto out; - physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); length = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0723774..0188e65 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static void ext4_invalidatepage(struct page *page, unsigned long offset); +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); /* * Test whether an inode is a fast symlink. @@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); - ext4_ioend_shutdown(inode); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); - ext4_ioend_shutdown(inode); + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) goto no_delete; @@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func, #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) -/* - * Return the number of contiguous dirty pages in a given inode - * starting at page frame idx. - */ -static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, - unsigned int max_pages) -{ - struct address_space *mapping = inode->i_mapping; - pgoff_t index; - struct pagevec pvec; - pgoff_t num = 0; - int i, nr_pages, done = 0; - - if (max_pages == 0) - return 0; - pagevec_init(&pvec, 0); - while (!done) { - index = idx; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - lock_page(page); - if (unlikely(page->mapping != mapping) || - !PageDirty(page) || - PageWriteback(page) || - page->index != idx) { - done = 1; - unlock_page(page); - break; - } - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) - done = 1; - bh = bh->b_this_page; - } while (!done && (bh != head)); - } - unlock_page(page); - if (done) - break; - idx++; - num++; - if (num >= max_pages) { - done = 1; - break; - } - } - pagevec_release(&pvec); - } - return num; -} - #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, @@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + ext4_es_lru_add(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { @@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file, } } - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) + goto errout; + copied = ret; + } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - if (copied < 0) - ret = copied; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int stop = offset + length; int num_clusters; ext4_fsblk_t lblk; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; + if (next_off > stop) + break; + if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); @@ -1460,145 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with writepage() call back - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. - * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct pagevec pvec; - unsigned long index, end; - int ret = 0, err, nr_pages, i; - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - loff_t size = i_size_read(inode); - unsigned int len, block_start; - struct buffer_head *bh, *page_bufs = NULL; - sector_t pblock = 0, cur_logical = 0; - struct ext4_io_submit io_submit; +struct mpage_da_data { + struct inode *inode; + struct writeback_control *wbc; - BUG_ON(mpd->next_page <= mpd->first_page); - ext4_io_submit_init(&io_submit, mpd->wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) - return -ENOMEM; + pgoff_t first_page; /* The first page to write */ + pgoff_t next_page; /* Current page to examine */ + pgoff_t last_page; /* Last page to examine */ /* - * We need to start from the first_page to the next_page - 1 - * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking - * at the currently mapped buffer_heads. + * Extent to map - this can be after first_page because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. */ - index = mpd->first_page; - end = mpd->next_page - 1; - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - int skip_page = 0; - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - - if (index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - if (map) { - cur_logical = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - pblock = map->m_pblk + (cur_logical - - map->m_lblk); - } - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - bh = page_bufs = page_buffers(page); - block_start = 0; - do { - if (map && (cur_logical >= map->m_lblk) && - (cur_logical <= (map->m_lblk + - (map->m_len - 1)))) { - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock; - } - if (buffer_unwritten(bh) || - buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); - clear_buffer_unwritten(bh); - } - - /* - * skip page if block allocation undone and - * block is dirty - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) - skip_page = 1; - bh = bh->b_this_page; - block_start += bh->b_size; - cur_logical++; - pblock++; - } while (bh != page_bufs); - - if (skip_page) { - unlock_page(page); - continue; - } - - clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&io_submit, page, len, - mpd->wbc); - if (!err) - mpd->pages_written++; - /* - * In error case, we have to continue because - * remaining pages are still locked - */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); - return ret; -} + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ +}; -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) { int nr_pages, i; pgoff_t index, end; struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t start, last; + + /* This is necessary when next_page == 0. */ + if (mpd->first_page >= mpd->next_page) + return; index = mpd->first_page; end = mpd->next_page - 1; - - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, start, last - start + 1); + if (invalidate) { + ext4_lblk_t start, last; + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + } pagevec_init(&pvec, 0); while (index <= end) { @@ -1611,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) break; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - block_invalidatepage(page, 0); - ClearPageUptodate(page); + if (invalidate) { + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + ClearPageUptodate(page); + } unlock_page(page); } index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } - return; } static void ext4_print_free_blocks(struct inode *inode) @@ -1647,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -/* - * mpage_da_map_and_submit - go through given space, map them - * if necessary, and then submit them for I/O - * - * @mpd - bh describing space - * - * The function skips space we know is already mapped to disk blocks. - * - */ -static void mpage_da_map_and_submit(struct mpage_da_data *mpd) -{ - int err, blks, get_blocks_flags; - struct ext4_map_blocks map, *mapp = NULL; - sector_t next = mpd->b_blocknr; - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; - loff_t disksize = EXT4_I(mpd->inode)->i_disksize; - handle_t *handle = NULL; - - /* - * If the blocks are mapped already, or we couldn't accumulate - * any blocks, then proceed immediately to the submission stage. - */ - if ((mpd->b_size == 0) || - ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten)))) - goto submit_io; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - - /* - * Call ext4_map_blocks() to allocate any delayed allocation - * blocks, or to convert an uninitialized extent to be - * initialized (in the case where we have written into - * one or more preallocated blocks). - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to - * indicate that we are on the delayed allocation path. This - * affects functions in many different parts of the allocation - * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_map_blocks() - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the - * inode's allocation semaphore is taken. - * - * If the blocks in questions were delalloc blocks, set - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting - * variables are updated after the blocks have been allocated. - */ - map.m_lblk = next; - map.m_len = max_blocks; - /* - * We're in delalloc path and it is possible that we're going to - * need more metadata blocks than previously reserved. However - * we must not fail because we're in writeback and there is - * nothing we can do about it so it might result in data loss. - * So use reserved blocks to allocate metadata if possible. - */ - get_blocks_flags = EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_METADATA_NOFAIL; - if (ext4_should_dioread_nolock(mpd->inode)) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - - - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); - if (blks < 0) { - struct super_block *sb = mpd->inode->i_sb; - - err = blks; - /* - * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will just let - * mpage_da_submit_io() unlock all of the pages. - */ - if (err == -EAGAIN) - goto submit_io; - - if (err == -ENOSPC && ext4_count_free_clusters(sb)) { - mpd->retval = err; - goto submit_io; - } - - /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. - */ - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { - ext4_msg(sb, KERN_CRIT, - "delayed block allocation failed for inode %lu " - "at logical offset %llu with max blocks %zd " - "with error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost"); - if (err == -ENOSPC) - ext4_print_free_blocks(mpd->inode); - } - /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd); - - /* Mark this page range as having been completed */ - mpd->io_done = 1; - return; - } - BUG_ON(blks == 0); - - mapp = ↦ - if (map.m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = mpd->inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map.m_len; i++) - unmap_underlying_metadata(bdev, map.m_pblk + i); - } - - /* - * Update on-disk size along with block allocation. - */ - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; - if (disksize > i_size_read(mpd->inode)) - disksize = i_size_read(mpd->inode); - if (disksize > EXT4_I(mpd->inode)->i_disksize) { - ext4_update_i_disksize(mpd->inode, disksize); - err = ext4_mark_inode_dirty(handle, mpd->inode); - if (err) - ext4_error(mpd->inode->i_sb, - "Failed to mark inode %lu dirty", - mpd->inode->i_ino); - } - -submit_io: - mpage_da_submit_io(mpd, mapp); - mpd->io_done = 1; -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ - (1 << BH_Delay) | (1 << BH_Unwritten)) - -/* - * mpage_add_bh_to_extent - try to add one more block to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @b_state - b_state of the buffer head added - * - * the function is used to collect contig. blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, - unsigned long b_state) -{ - sector_t next; - int blkbits = mpd->inode->i_blkbits; - int nrblocks = mpd->b_size >> blkbits; - - /* - * XXX Don't go larger than mballoc is willing to allocate - * This is a stopgap solution. We eventually need to fold - * mpage_da_submit_io() into this function and then call - * ext4_map_blocks() multiple times in a loop - */ - if (nrblocks >= (8*1024*1024 >> blkbits)) - goto flush_it; - - /* check if the reserved journal credits might overflow */ - if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { - if (nrblocks >= EXT4_MAX_TRANS_DATA) { - /* - * With non-extent format we are limited by the journal - * credit available. Total credit needed to insert - * nrblocks contiguous blocks is dependent on the - * nrblocks. So limit nrblocks. - */ - goto flush_it; - } - } - /* - * First block in the extent - */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = 1 << blkbits; - mpd->b_state = b_state & BH_FLAGS; - return; - } - - next = mpd->b_blocknr + nrblocks; - /* - * Can we merge the block to our big extent? - */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += 1 << blkbits; - return; - } - -flush_it: - /* - * We couldn't merge the block to our extent, so we - * need to flush current extent and start new one - */ - mpage_da_map_and_submit(mpd); - return; -} - static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) { return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); @@ -1888,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); + ext4_es_lru_add(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { @@ -2161,7 +1804,7 @@ out: * lock so we have to do some magic. * * This function can get called via... - * - ext4_da_writepages after taking page lock (have journal handle) + * - ext4_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via the kswapd/direct reclaim (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) @@ -2243,6 +1886,7 @@ static int ext4_writepage(struct page *page, io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_submit.io_end) { redirty_page_for_writepage(wbc, page); + unlock_page(page); return -ENOMEM; } ret = ext4_bio_write_page(&io_submit, page, len, wbc); @@ -2252,70 +1896,391 @@ static int ext4_writepage(struct page *page, return ret; } +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) + /* - * This is called via ext4_da_writepages() to - * calculate the total number of credits to reserve to fit - * a single extent allocation into a single transaction, - * ext4_da_writpeages() will loop calling this before - * the block allocation. + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks upto full group size. */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 -static int ext4_da_writepages_trans_blocks(struct inode *inode) +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @b_state - b_state of the buffer head added + * + * the function is used to collect contig. blocks in same state + */ +static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + unsigned long b_state) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return 0; + + /* First block in the extent? */ + if (map->m_len == 0) { + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = b_state & BH_FLAGS; + return 1; + } + + /* Can we merge the block to our big extent? */ + if (lblk == map->m_lblk + map->m_len && + (b_state & BH_FLAGS) == map->m_flags) { + map->m_len++; + return 1; + } + return 0; +} + +static bool add_page_bufs_to_extent(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) +{ + struct inode *inode = mpd->inode; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + + do { + BUG_ON(buffer_locked(bh)); + + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh)) || + lblk >= blocks) { + /* Found extent to map? */ + if (mpd->map.m_len) + return false; + if (lblk >= blocks) + return true; + continue; + } + if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state)) + return false; + } while (lblk++, (bh = bh->b_this_page) != head); + return true; +} + +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + * submit fully mapped pages for IO + * + * @mpd - description of extent to map, on return next extent to map + * + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to uninitialized extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. + */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) +{ + struct pagevec pvec; + int nr_pages, i; + struct inode *inode = mpd->inode; + struct buffer_head *head, *bh; + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + pgoff_t start, end; + ext4_lblk_t lblk; + sector_t pblock; + int err; + + start = mpd->map.m_lblk >> bpp_bits; + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + lblk = start << bpp_bits; + pblock = mpd->map.m_pblk; + + pagevec_init(&pvec, 0); + while (start <= end) { + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + /* Upto 'end' pages must be contiguous */ + BUG_ON(page->index != start); + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + add_page_bufs_to_extent(mpd, head, bh, + lblk); + pagevec_release(&pvec); + return 0; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (++lblk < blocks && + (bh = bh->b_this_page) != head); + + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + /* Page fully mapped - let IO run! */ + err = mpage_submit_page(mpd, page); + if (err < 0) { + pagevec_release(&pvec); + return err; + } + start++; + } + pagevec_release(&pvec); + } + /* Extent fully mapped and matches with page boundary. We are done. */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { - int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int get_blocks_flags; + int err; + trace_ext4_da_write_pages_extent(inode, map); /* - * With non-extent format the journal credit needed to - * insert nrblocks contiguous block is dependent on - * number of contiguous block. So we will limit - * number of contiguous block to a sane value + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or + * to convert an uninitialized extent to be initialized (in the case + * where we have written into one or more preallocated blocks). It is + * possible that we're going to need more metadata blocks than + * previously reserved. However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks + * in question are delalloc blocks. This affects functions in many + * different parts of the allocation call path. This flag exists + * primarily because we don't want to change *many* call functions, so + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag + * once the inode's allocation semaphore is taken. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - (max_blocks > EXT4_MAX_TRANS_DATA)) - max_blocks = EXT4_MAX_TRANS_DATA; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; + if (ext4_should_dioread_nolock(inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (map->m_flags & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (map->m_flags & EXT4_MAP_UNINIT) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + } - return ext4_chunk_trans_blocks(inode, max_blocks); + BUG_ON(map->m_len == 0); + if (map->m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map->m_len; i++) + unmap_underlying_metadata(bdev, map->m_pblk + i); + } + return 0; } /* - * write_cache_pages_da - walk the list of dirty pages of the given - * address space and accumulate pages that need writing, and call - * mpage_da_map_and_submit to map a single contiguous memory region - * and then write them. + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. */ -static int write_cache_pages_da(handle_t *handle, - struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd, - pgoff_t *done_index) +static int mpage_map_and_submit_extent(handle_t *handle, + struct mpage_da_data *mpd, + bool *give_up_on_write) { - struct buffer_head *bh, *head; - struct inode *inode = mapping->host; - struct pagevec pvec; - unsigned int nr_pages; - sector_t logical; - pgoff_t index, end; - long nr_to_write = wbc->nr_to_write; - int i, tag, ret = 0; - - memset(mpd, 0, sizeof(struct mpage_da_data)); - mpd->wbc = wbc; - mpd->inode = inode; - pagevec_init(&pvec, 0); - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int err; + loff_t disksize; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + mpd->io_submit.io_end->offset = + ((loff_t)map->m_lblk) << inode->i_blkbits; + while (map->m_len) { + err = mpage_map_one_extent(handle, mpd); + if (err < 0) { + struct super_block *sb = inode->i_sb; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + goto invalidate_dirty_pages; + /* + * Let the uper layers retry transient errors. + * In the case of ENOSPC, if ext4_count_free_blocks() + * is non-zero, a commit should free up blocks. + */ + if ((err == -ENOMEM) || + (err == -ENOSPC && ext4_count_free_clusters(sb))) + return err; + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with" + " max blocks %u with error %d", + inode->i_ino, + (unsigned long long)map->m_lblk, + (unsigned)map->m_len, -err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(inode); + invalidate_dirty_pages: + *give_up_on_write = true; + return err; + } + /* + * Update buffer state, submit mapped pages, and get us new + * extent to map + */ + err = mpage_map_and_submit_buffers(mpd); + if (err < 0) + return err; + } + + /* Update on-disk size after IO is submitted */ + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + int err2; + + ext4_update_i_disksize(inode, disksize); + err2 = ext4_mark_inode_dirty(handle, inode); + if (err2) + ext4_error(inode->i_sb, + "Failed to mark inode %lu dirty", + inode->i_ino); + if (!err) + err = err2; + } + return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. + */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + + return ext4_meta_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * and underlying extent to map + * + * @mpd - where to look for pages + * + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). + * + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. + */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec pvec; + unsigned int nr_pages; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + int tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - *done_index = index; + pagevec_init(&pvec, 0); + mpd->map.m_len = 0; + mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) - return 0; + goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2330,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle, if (page->index > end) goto out; - *done_index = page->index + 1; - - /* - * If we can't merge this page, and we have - * accumulated an contiguous region, write it - */ - if ((mpd->next_page != page->index) && - (mpd->next_page != mpd->first_page)) { - mpage_da_map_and_submit(mpd); - goto ret_extent_tail; - } + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; lock_page(page); - /* - * If the page is no longer dirty, or its - * mapping no longer corresponds to inode we - * are writing (which means it has been - * truncated or invalidated), or the page is - * already under writeback and we are not - * doing a data integrity writeback, skip the page + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page */ if (!PageDirty(page) || (PageWriteback(page) && - (wbc->sync_mode == WB_SYNC_NONE)) || + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(page->mapping != mapping)) { unlock_page(page); continue; @@ -2363,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - /* - * If we have inline data and arrive here, it means that - * we will soon create the block for the 1st page, so - * we'd better clear the inline data here. - */ - if (ext4_has_inline_data(inode)) { - BUG_ON(ext4_test_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA)); - ext4_destroy_inline_data(handle, inode); - } - - if (mpd->next_page != page->index) + if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_CACHE_SHIFT - blkbits); head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate unmapped blocks - * in the same page. Otherwise we won't make - * progress with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && - buffer_mapped(bh)) { - /* - * mapped dirty buffer. We need to - * update the b_state because we look - * at b_state in mpage_da_map_blocks. - * We don't update b_size because if we - * find an unmapped buffer_head later - * we need to use the b_state flag of - * that buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = - bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ + if (!add_page_bufs_to_extent(mpd, head, head, lblk)) + goto out; + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, page); + if (err < 0) goto out; } + + /* + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->next_page - mpd->first_page >= + mpd->wbc->nr_to_write) + goto out; } pagevec_release(&pvec); cond_resched(); } return 0; -ret_extent_tail: - ret = MPAGE_DA_EXTENT_TAIL; out: pagevec_release(&pvec); - cond_resched(); - return ret; + return err; } +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = ext4_writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} -static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - pgoff_t index; + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; int range_whole = 0; + int cycled = 1; handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; - int pages_written = 0; - unsigned int max_pages; - int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0; - long desired_nr_to_write, nr_to_writebump = 0; - loff_t range_start = wbc->range_start; + int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - pgoff_t done_index = 0; - pgoff_t end; + bool done; struct blk_plug plug; + bool give_up_on_write = false; - trace_ext4_da_writepages(inode, wbc); + trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2472,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping, if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + if (ext4_should_journal_data(inode)) { + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; + } + /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because * the latter could be true if the filesystem is mounted - * read-only, and in that case, ext4_da_writepages should + * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert upto one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + } + + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - range_cyclic = wbc->range_cyclic; if (wbc->range_cyclic) { - index = mapping->writeback_index; - if (index) + writeback_index = mapping->writeback_index; + if (writeback_index) cycled = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = LLONG_MAX; - wbc->range_cyclic = 0; - end = -1; + mpd.first_page = writeback_index; + mpd.last_page = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - } - - /* - * This works around two forms of stupidity. The first is in - * the writeback code, which caps the maximum number of pages - * written to be 1024 pages. This is wrong on multiple - * levels; different architectues have a different page size, - * which changes the maximum amount of data which gets - * written. Secondly, 4 megabytes is way too small. XFS - * forces this value to be 16 megabytes by multiplying - * nr_to_write parameter by four, and then relies on its - * allocator to allocate larger extents to make them - * contiguous. Unfortunately this brings us to the second - * stupidity, which is that ext4's mballoc code only allocates - * at most 2048 blocks. So we force contiguous writes up to - * the number of dirty blocks in the inode, or - * sbi->max_writeback_mb_bump whichever is smaller. - */ - max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) { - if (wbc->nr_to_write == LONG_MAX) - desired_nr_to_write = wbc->nr_to_write; - else - desired_nr_to_write = wbc->nr_to_write * 8; - } else - desired_nr_to_write = ext4_num_dirty_pages(inode, index, - max_pages); - if (desired_nr_to_write > max_pages) - desired_nr_to_write = max_pages; - - if (wbc->nr_to_write < desired_nr_to_write) { - nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; - wbc->nr_to_write = desired_nr_to_write; + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; } + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + done = false; blk_start_plug(&plug); - while (!ret && wbc->nr_to_write > 0) { + while (!done && mpd.first_page <= mpd.last_page) { + /* For each extent of pages we use new io_end */ + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + break; + } /* - * we insert one extent at a time. So we need - * credit needed for single extent allocation. - * journalled mode is currently not supported - * by delalloc + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); - /* start a new transaction*/ - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - needed_blocks); + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); - blk_finish_plug(&plug); - goto out_writepages; + /* Release allocated io_end */ + ext4_put_io_end(mpd.io_submit.io_end); + break; } - /* - * Now call write_cache_pages_da() to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4 and submit them. - */ - ret = write_cache_pages_da(handle, mapping, - wbc, &mpd, &done_index); - /* - * If we have a contiguous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. - */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - mpage_da_map_and_submit(&mpd); - ret = MPAGE_DA_EXTENT_TAIL; + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); + ret = mpage_prepare_extent_to_map(&mpd); + if (!ret) { + if (mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, + &give_up_on_write); + else { + /* + * We scanned the whole range (or exhausted + * nr_to_write), submitted what was mapped and + * didn't find anything needing mapping. We are + * done. + */ + done = true; + } } - trace_ext4_da_write_pages(inode, &mpd); - wbc->nr_to_write -= mpd.pages_written; - ext4_journal_stop(handle); - - if ((mpd.retval == -ENOSPC) && sbi->s_journal) { - /* commit the transaction which would + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, give_up_on_write); + /* Drop our io_end reference we got from init */ + ext4_put_io_end(mpd.io_submit.io_end); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; - } else if (ret == MPAGE_DA_EXTENT_TAIL) { - /* - * Got one extent now try with rest of the pages. - * If mpd.retval is set -EIO, journal is aborted. - * So we don't need to write any more. - */ - pages_written += mpd.pages_written; - ret = mpd.retval; - io_done = 1; - } else if (wbc->nr_to_write) - /* - * There is no more writeout needed - * or we requested for a noblocking writeout - * and we found the device congested - */ + continue; + } + /* Fatal error - ENOMEM, EIO... */ + if (ret) break; } blk_finish_plug(&plug); - if (!io_done && !cycled) { + if (!ret && !cycled) { cycled = 1; - index = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = mapping->writeback_index - 1; + mpd.last_page = writeback_index - 1; + mpd.first_page = 0; goto retry; } /* Update index */ - wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* - * set the writeback_index so that range_cyclic + * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = done_index; + mapping->writeback_index = mpd.first_page; out_writepages: - wbc->nr_to_write -= nr_to_writebump; - wbc->range_start = range_start; - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); return ret; } @@ -2841,7 +2761,8 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { /* * Drop reserved blocks @@ -2850,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) if (!page_has_buffers(page)) goto out; - ext4_da_page_release_reservation(page, offset); + ext4_da_page_release_reservation(page, offset, length); out: - ext4_invalidatepage(page, offset); + ext4_invalidatepage(page, offset, length); return; } @@ -2876,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * - * ext4_da_writepages() -> + * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() @@ -3001,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - trace_ext4_invalidatepage(page, offset); + trace_ext4_invalidatepage(page, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); - block_invalidatepage(page, offset); + block_invalidatepage(page, offset, length); } static int __ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); - trace_ext4_journalled_invalidatepage(page, offset); + trace_ext4_journalled_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - return jbd2_journal_invalidatepage(journal, page, offset); + return jbd2_journal_invalidatepage(journal, page, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { - WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); + WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3141,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, BUG_ON(iocb->private == NULL); + /* + * Make all waiters for direct IO properly wait also for extent + * conversion. This also disallows race between truncate() and + * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + */ + if (rw == WRITE) + atomic_inc(&inode->i_dio_count); + /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); if (overwrite) { - atomic_inc(&inode->i_dio_count); down_read(&EXT4_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } @@ -3216,11 +3147,19 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ext4_inode_aio_set(inode, NULL); ext4_put_io_end(io_end); /* - * In case of error or no write ext4_end_io_dio() was not + * When no IO was submitted ext4_end_io_dio() was not * called so we have to put iocb's reference. */ - if (ret <= 0 && ret != -EIOCBQUEUED) { + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { WARN_ON(iocb->private != io_end); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->iocb); + /* + * Generic code already did inode_dio_done() so we + * have to clear EXT4_IO_END_DIRECT to not do it for + * the second time. + */ + io_end->flag = 0; ext4_put_io_end(io_end); iocb->private = NULL; } @@ -3232,7 +3171,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * for non AIO case, since the IO is already * completed, we could do the conversion right here */ - err = ext4_convert_unwritten_extents(inode, + err = ext4_convert_unwritten_extents(NULL, inode, offset, ret); if (err < 0) ret = err; @@ -3240,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, } retake_lock: + if (rw == WRITE) + inode_dio_done(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { - inode_dio_done(inode); up_read(&EXT4_I(inode)->i_data_sem); mutex_lock(&inode->i_mutex); } @@ -3301,6 +3241,7 @@ static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .bmap = ext4_bmap, @@ -3316,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, @@ -3331,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, - .writepages = ext4_da_writepages, + .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .bmap = ext4_bmap, @@ -3364,89 +3306,56 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } - /* - * ext4_discard_partial_page_buffers() - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. - * This function finds and locks the page containing the offset - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. - * Calling functions that already have the page locked should call - * ext4_discard_partial_page_buffers_no_lock directly. + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags) +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) { + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; struct inode *inode = mapping->host; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, - from, length, flags); + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); - unlock_page(page); - page_cache_release(page); - return err; + return ext4_block_zero_page_range(handle, mapping, from, length); } /* - * ext4_discard_partial_page_buffers_no_lock() - * Zeros a page range of length 'length' starting from offset 'from'. - * Buffer heads that correspond to the block aligned regions of the - * zeroed range will be unmapped. Unblock aligned regions - * will have the corresponding buffer head mapped if needed so that - * that region of the page can be updated with the partial zero out. - * - * This function assumes that the page has already been locked. The - * The range to be discarded must be contained with in the given page. - * If the specified range exceeds the end of the page it will be shortened - * to the end of the page that corresponds to 'from'. This function is - * appropriate for updating a page and it buffer heads to be unmapped and - * zeroed for blocks that have been either released, or are going to be - * released. - * - * handle: The journal handle - * inode: The files inode - * page: A locked page that contains the offset "from" - * from: The starting byte offset (from the beginning of the file) - * to begin discarding - * len: The length of bytes to discard - * flags: Optional flags that may be used: - * - * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED - * Only zero the regions of the page whose buffer heads - * have already been unmapped. This flag is appropriate - * for updating the contents of a page whose blocks may - * have already been released, and we only want to zero - * out the regions that correspond to those released blocks. - * - * Returns zero on success or negative on failure. + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' */ -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags) +int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned int offset = from & (PAGE_CACHE_SIZE-1); - unsigned int blocksize, max, pos; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, max, pos; ext4_lblk_t iblock; + struct inode *inode = mapping->host; struct buffer_head *bh; + struct page *page; int err = 0; - blocksize = inode->i_sb->s_blocksize; - max = PAGE_CACHE_SIZE - offset; + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; - if (index != page->index) - return -EINVAL; + blocksize = inode->i_sb->s_blocksize; + max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between - * 'from' and the end of the page + * 'from' and the end of the block */ if (length > max || length < 0) length = max; @@ -3464,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, iblock++; pos += blocksize; } - - pos = offset; - while (pos < offset + length) { - unsigned int end_of_block, range_to_discard; - - err = 0; - - /* The length of space left to zero and unmap */ - range_to_discard = offset + length - pos; - - /* The length of space until the end of the block */ - end_of_block = blocksize - (pos & (blocksize-1)); - - /* - * Do not unmap or zero past end of block - * for this buffer head - */ - if (range_to_discard > end_of_block) - range_to_discard = end_of_block; - - - /* - * Skip this buffer head if we are only zeroing unampped - * regions of the page - */ - if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && - buffer_mapped(bh)) - goto next; - - /* If the range is block aligned, unmap */ - if (range_to_discard == blocksize) { - clear_buffer_dirty(bh); - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); - clear_buffer_uptodate(bh); - zero_user(page, pos, range_to_discard); - BUFFER_TRACE(bh, "Buffer discarded"); - goto next; - } - - /* - * If this block is not completely contained in the range - * to be discarded, then it is not going to be released. Because - * we need to keep this block, we need to make sure this part - * of the page is uptodate before we modify it by writeing - * partial zeros on it. - */ + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { - /* - * Buffer head must be mapped before we can read - * from the block - */ - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto next; - } + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; } + } - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt.*/ - if (!buffer_uptodate(bh)) - goto next; - } + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto next; - } + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + err = 0; + mark_buffer_dirty(bh); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + err = ext4_jbd2_file_inode(handle, inode); + } - zero_user(page, pos, range_to_discard); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); +int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t length) +{ + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned partial_start, partial_end; + ext4_fsblk_t start, end; + loff_t byte_end = (lstart + length - 1); + int err = 0; - BUFFER_TRACE(bh, "Partial buffer zeroed"); -next: - bh = bh->b_this_page; - iblock++; - pos += range_to_discard; - } + partial_start = lstart & (sb->s_blocksize - 1); + partial_end = byte_end & (sb->s_blocksize - 1); + start = lstart >> sb->s_blocksize_bits; + end = byte_end >> sb->s_blocksize_bits; + + /* Handle partial zero within the single block */ + if (start == end && + (partial_start || (partial_end != sb->s_blocksize - 1))) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, length); + return err; + } + /* Handle partial zero out on the start of the range */ + if (partial_start) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, sb->s_blocksize); + if (err) + return err; + } + /* Handle partial zero out on the end of the range */ + if (partial_end != sb->s_blocksize - 1) + err = ext4_block_zero_page_range(handle, mapping, + byte_end - partial_end, + partial_end + 1); return err; } @@ -3589,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) { - struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; + loff_t first_block_offset, last_block_offset; handle_t *handle; unsigned int credits; int ret = 0; @@ -3647,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); - } + /* Now release the pages and zero block aligned part of pages*/ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); - ret = ext4_flush_unwritten_io(inode); - if (ret) - goto out_dio; inode_dio_wait(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -3677,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) goto out_dio; } - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the - * buffer heads for the block aligned regions of the page that - * were completely zeroed. - */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained - * within a page just zero out and unmap the middle of - * that page - */ - ret = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (ret) - goto out_stop; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (ret) - goto out_stop; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (ret) - goto out_stop; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (ret) - goto out_stop; - } - } + ret = ext4_zero_partial_blocks(handle, inode, offset, + length); + if (ret) + goto out_stop; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -3812,7 +3641,6 @@ void ext4_truncate(struct inode *inode) unsigned int credits; handle_t *handle; struct address_space *mapping = inode->i_mapping; - loff_t page_len; /* * There is a possibility that we're either freeing the inode @@ -3839,12 +3667,6 @@ void ext4_truncate(struct inode *inode) return; } - /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_unwritten_io(inode); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else @@ -3856,14 +3678,8 @@ void ext4_truncate(struct inode *inode) return; } - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0)) - goto out_stop; - } + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this @@ -4632,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) inode->i_size >> PAGE_CACHE_SHIFT); if (!page) return; - ret = __ext4_journalled_invalidatepage(page, offset); + ret = __ext4_journalled_invalidatepage(page, offset, + PAGE_CACHE_SIZE - offset); unlock_page(page); page_cache_release(page); if (ret != -EBUSY) @@ -4814,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode; - unsigned long delalloc_blocks; + unsigned long long delalloc_blocks; inode = dentry->d_inode; generic_fillattr(inode, stat); @@ -4832,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); - stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9); return 0; } -static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_ind_trans_blocks(inode, nrblocks, chunk); - return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); } /* @@ -4854,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -4862,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) int ret = 0; /* - * How many index blocks need to touch to modify nrblocks? - * The "Chunk" flag indicating whether the nrblocks is - * physically contiguous on disk - * - * For Direct IO and fallocate, they calls get_block to allocate - * one single extent at a time, so they could set the "Chunk" flag + * How many index blocks need to touch to map @lblocks logical blocks + * to @pextents physical extents? */ - idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; @@ -4877,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; - if (chunk) - groups += 1; - else - groups += nrblocks; - + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -4913,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) int bpp = ext4_journal_blocks_per_page(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, 0); + ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b1ed9e0..a9ff5e5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2105,7 +2105,12 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - if (group == ngroups) + cond_resched(); + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + if (group >= ngroups) group = 0; /* This now checks without needing the buddy page */ @@ -4401,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) { - ext4_discard_allocated_blocks(ac); - goto errout; - } + if (*errp) + goto discard_and_exit; /* as we've just preallocated more space than - * user requested orinally, we store allocated + * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4608,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, BUG_ON(bh && (count > 1)); for (i = 0; i < count; i++) { + cond_resched(); if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); - if (unlikely(!tbh)) + if (!tbh) continue; ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 3dcbf36..e86dddb 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, struct page *pagep[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset; - long long offs = orig_page_offset << PAGE_CACHE_SHIFT; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; @@ -940,8 +939,6 @@ again: orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; - offs = (long long)orig_blk_offset << orig_inode->i_blkbits; - /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6653fc3..234b834 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file, bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse(bh); - return count; + /* silently ignore the rest of the block */ + break; } ext4fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -2299,6 +2296,45 @@ retry: return err; } +static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + dquot_initialize(dir); + +retry: + inode = ext4_new_inode_start_handle(dir, mode, + NULL, 0, NULL, + EXT4_HT_DIR, + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT4_XATTR_TRANS_BLOCKS); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = ext4_orphan_add(handle, inode); + if (err) + goto err_drop_inode; + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_drop_inode: + ext4_journal_stop(handle); + unlock_new_inode(inode); + iput(inode); + return err; +} + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, @@ -2906,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry, retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS)); + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2920,6 +2956,11 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); } else { drop_nlink(inode); @@ -3172,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = { .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, + .tmpfile = ext4_tmpfile, .rename = ext4_rename, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 19599bd..48786cd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -46,29 +46,82 @@ void ext4_exit_pageio(void) } /* - * This function is called by ext4_evict_inode() to make sure there is - * no more pending I/O completion work left to do. + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... */ -void ext4_ioend_shutdown(struct inode *inode) +static void buffer_io_error(struct buffer_head *bh) { - wait_queue_head_t *wq = ext4_ioend_wq(inode); + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); - /* - * We need to make sure the work structure is finished being - * used before we let the inode get destroyed. - */ - if (work_pending(&EXT4_I(inode)->i_unwritten_work)) - cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); +static void ext4_finish_bio(struct bio *bio) +{ + int i; + int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + + for (i = 0; i < bio->bi_vcnt; i++) { + struct bio_vec *bvec = &bio->bi_io_vec[i]; + struct page *page = bvec->bv_page; + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); + } } static void ext4_release_io_end(ext4_io_end_t *io_end) { + struct bio *bio, *next_bio; + BUG_ON(!list_empty(&io_end->list)); BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) wake_up_all(ext4_ioend_wq(io_end->inode)); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); + } if (io_end->flag & EXT4_IO_END_DIRECT) inode_dio_done(io_end->inode); if (io_end->iocb) @@ -86,19 +139,28 @@ static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) wake_up_all(ext4_ioend_wq(inode)); } -/* check a range of space and convert unwritten extents to written. */ +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */ static int ext4_end_io(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; ssize_t size = io->size; + handle_t *handle = io->handle; int ret = 0; ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - ret = ext4_convert_unwritten_extents(inode, offset, size); + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); if (ret < 0) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " @@ -111,20 +173,17 @@ static int ext4_end_io(ext4_io_end_t *io) return ret; } -static void dump_completed_IO(struct inode *inode) +static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; - if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { - ext4_debug("inode %lu completed_io list is empty\n", - inode->i_ino); + if (list_empty(head)) return; - } - ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { cur = &io->list; before = cur->prev; io0 = container_of(before, ext4_io_end_t, list); @@ -145,16 +204,23 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) unsigned long flags; BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (list_empty(&ei->i_completed_io_list)) - queue_work(wq, &ei->i_unwritten_work); - list_add_tail(&io_end->list, &ei->i_completed_io_list); + if (io_end->handle) { + wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + } else { + wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq; + if (list_empty(&ei->i_unrsv_conversion_list)) + queue_work(wq, &ei->i_unrsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list); + } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); } -static int ext4_do_flush_completed_IO(struct inode *inode) +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) { ext4_io_end_t *io; struct list_head unwritten; @@ -163,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode) int err, ret = 0; spin_lock_irqsave(&ei->i_completed_io_lock, flags); - dump_completed_IO(inode); - list_replace_init(&ei->i_completed_io_list, &unwritten); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { @@ -180,23 +246,20 @@ static int ext4_do_flush_completed_IO(struct inode *inode) } /* - * work on completed aio dio IO, to convert unwritten extents to extents + * work on completed IO, to convert unwritten extents to extents */ -void ext4_end_io_work(struct work_struct *work) +void ext4_end_io_rsv_work(struct work_struct *work) { struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, - i_unwritten_work); - ext4_do_flush_completed_IO(&ei->vfs_inode); + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); } -int ext4_flush_unwritten_io(struct inode *inode) +void ext4_end_io_unrsv_work(struct work_struct *work) { - int ret; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && - !(inode->i_state & I_FREEING)); - ret = ext4_do_flush_completed_IO(inode); - ext4_unwritten_wait(inode); - return ret; + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_unrsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list); } ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) @@ -228,8 +291,10 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->inode, - io_end->offset, io_end->size); + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } ext4_release_io_end(io_end); @@ -243,79 +308,31 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) return io_end; } -/* - * Print an buffer I/O error compatible with the fs/buffer.c. This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message. We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... - */ -static void buffer_io_error(struct buffer_head *bh) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); -} - static void ext4_end_bio(struct bio *bio, int error) { ext4_io_end_t *io_end = bio->bi_private; - struct inode *inode; - int i; - int blocksize; sector_t bi_sector = bio->bi_sector; BUG_ON(!io_end); - inode = io_end->inode; - blocksize = 1 << inode->i_blkbits; - bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - for (i = 0; i < bio->bi_vcnt; i++) { - struct bio_vec *bvec = &bio->bi_io_vec[i]; - struct page *page = bvec->bv_page; - struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; - unsigned under_io = 0; - unsigned long flags; - if (!page) - continue; - - if (error) { - SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - } - bh = head = page_buffers(page); + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { /* - * We check all buffers in the page under BH_Uptodate_Lock - * to avoid races with other end io clearing async_write flags + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. */ - local_irq_save(flags); - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); - do { - if (bh_offset(bh) < bio_start || - bh_offset(bh) + blocksize > bio_end) { - if (buffer_async_write(bh)) - under_io++; - continue; - } - clear_buffer_async_write(bh); - if (error) - buffer_io_error(bh); - } while ((bh = bh->b_this_page) != head); - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); - local_irq_restore(flags); - if (!under_io) - end_page_writeback(page); + bio->bi_private = xchg(&io_end->bio, bio); + } else { + ext4_finish_bio(bio); + bio_put(bio); } - bio_put(bio); if (error) { - io_end->flag |= EXT4_IO_END_ERROR; + struct inode *inode = io_end->inode; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " "(offset %llu size %ld starting block %llu)", inode->i_ino, @@ -324,7 +341,6 @@ static void ext4_end_bio(struct bio *bio, int error) (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); } - ext4_put_io_end_defer(io_end); } @@ -356,13 +372,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io, struct bio *bio; bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + if (!bio) + return -ENOMEM; bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); - if (!io->io_end->size) - io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT) - + bh_offset(bh); io->io_bio = bio; io->io_next_block = bh->b_blocknr; return 0; @@ -372,7 +387,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, struct buffer_head *bh) { - ext4_io_end_t *io_end; int ret; if (io->io_bio && bh->b_blocknr != io->io_next_block) { @@ -387,10 +401,6 @@ submit_and_retry: ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - io_end = io->io_end; - if (test_clear_buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io_end->size += bh->b_size; io->io_next_block++; return 0; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b27c96d..c5adbb3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb, ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_group_overhead_blocks(sb, group); - ext4_fsblk_t metaend = start + overhead; + unsigned overhead; + ext4_fsblk_t metaend; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; int err = -EINVAL; + if (group != sbi->s_groups_count) { + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + return -EINVAL; + } + + overhead = ext4_group_overhead_blocks(sb, group); + metaend = start + overhead; input->free_blocks_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; @@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb, free_blocks_count, input->reserved_blocks); ext4_get_group_no_and_offset(sb, start, NULL, &offset); - if (group != sbi->s_groups_count) - ext4_warning(sb, "Cannot add at group %u (only %u groups)", - input->group, sbi->s_groups_count); - else if (offset != 0) + if (offset != 0) ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) ext4_warning(sb, "Reserved blocks too high (%u)", @@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; struct inode *inode = NULL; - int gdb_off, gdb_num; + int gdb_off; int err; __u16 bg_flags = 0; - gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -1656,12 +1660,10 @@ errout: err = err2; if (!err) { - ext4_fsblk_t first_block; - first_block = ext4_group_first_block_no(sb, 0); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block), 0); } return err; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94cc84d..85b3dd6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb) } if (test_opt(sb, ERRORS_RO)) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) @@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } -void ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) +void __ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; struct va_format vaf; @@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function, ext4_handle_error(inode->i_sb); } -void ext4_error_file(struct file *file, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) +void __ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) { va_list args; struct va_format vaf; @@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function, if ((sb->s_flags & MS_RDONLY) == 0) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function, panic("EXT4-fs panic from previous error\n"); } -void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) +void __ext4_msg(struct super_block *sb, + const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - flush_workqueue(sbi->dio_unwritten_wq); - destroy_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->unrsv_conversion_wq); + flush_workqueue(sbi->rsv_conversion_wq); + destroy_workqueue(sbi->unrsv_conversion_wq); + destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); @@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } - ext4_es_unregister_shrinker(sb); + ext4_es_unregister_shrinker(sbi); del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); @@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_lru); ei->i_es_lru_nr = 0; + ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_quota = 0; #endif ei->jinode = NULL; - INIT_LIST_HEAD(&ei->i_completed_io_list); + INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + INIT_LIST_HEAD(&ei->i_unrsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); - INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); + INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work); return &ei->vfs_inode; } @@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = { .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, + .sync_fs = ext4_sync_fs_nojournal, .put_super = ext4_put_super, .statfs = ext4_statfs, .remount_fs = ext4_remount, @@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; ext4_group_t flex_group; - unsigned int groups_per_flex = 0; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; @@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb) sbi->s_log_groups_per_flex = 0; return 1; } - groups_per_flex = 1U << sbi->s_log_groups_per_flex; err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) @@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb, list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); dquot_initialize(inode); if (inode->i_nlink) { - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); mutex_lock(&inode->i_mutex); + truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); mutex_unlock(&inode->i_mutex); nr_truncates++; } else { - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -2377,7 +2396,10 @@ struct ext4_attr { ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, const char *, size_t); - int offset; + union { + int offset; + int deprecated_val; + } u; }; static int parse_strtoull(const char *buf, @@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, static ssize_t sbi_ui_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } @@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) { - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); unsigned long t; int ret; @@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a, return count; } +static ssize_t sbi_deprecated_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); +} + #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ - .offset = offsetof(struct ext4_sb_info, _elname), \ + .u = { \ + .offset = offsetof(struct ext4_sb_info, _elname),\ + }, \ } #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) @@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) #define ATTR_LIST(name) &ext4_attr_##name.attr +#define EXT4_DEPRECATED_ATTR(_name, _val) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = sbi_deprecated_show, \ + .u = { \ + .deprecated_val = _val, \ + }, \ +} EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); @@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); @@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sb); + ext4_es_register_shrinker(sbi); err = percpu_counter_init(&sbi->s_freeclusters_counter, ext4_count_free_clusters(sb)); @@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); - sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; /* @@ -3915,12 +3952,20 @@ no_journal: * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ - EXT4_SB(sb)->dio_unwritten_wq = - alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->dio_unwritten_wq) { - printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + EXT4_SB(sb)->rsv_conversion_wq = + alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->rsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); ret = -ENOMEM; - goto failed_mount_wq; + goto failed_mount4; + } + + EXT4_SB(sb)->unrsv_conversion_wq = + alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->unrsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + ret = -ENOMEM; + goto failed_mount4; } /* @@ -4074,14 +4119,17 @@ failed_mount4a: sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); - destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + if (EXT4_SB(sb)->unrsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); failed_mount_wq: if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3: - ext4_es_unregister_shrinker(sb); + ext4_es_unregister_shrinker(sbi); del_timer(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); @@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; + bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); trace_ext4_sync_fs(sb, wait); - flush_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->rsv_conversion_wq); + flush_workqueue(sbi->unrsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); + /* + * Data writeback is possible w/o journal transaction, so barrier must + * being sent at the end of the function. But we can skip it if + * transaction_commit will do it for us. + */ + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(sbi->s_journal, target); + ret = jbd2_log_wait_commit(sbi->s_journal, target); + } + if (needs_barrier) { + int err; + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; } + + return ret; +} + +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) +{ + int ret = 0; + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq); + dquot_writeback_dquots(sb, -1); + if (wait && test_opt(sb, BARRIER)) + ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + return ret; } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index fd27e7e..e06e099 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config F2FS_FS_SECURITY + bool "F2FS Security Labels" + depends on F2FS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the f2fs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 44abc2f..b7826ec 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } } - error = f2fs_setxattr(inode, name_index, "", value, size); + error = f2fs_setxattr(inode, name_index, "", value, size, NULL); kfree(value); if (!error) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b1de01d..66a6b85 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, unsigned long blk_size = sbi->blocksize; struct f2fs_checkpoint *cp_block; unsigned long long cur_version = 0, pre_version = 0; - unsigned int crc = 0; size_t crc_offset; + __u32 crc = 0; /* Read the 1st cp block in this CP pack */ cp_page_1 = get_meta_page(sbi, cp_addr); @@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp1; - crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); + crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; @@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp2; - crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); + crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; @@ -450,13 +450,30 @@ fail_no_cp: return -EINVAL; } -void set_dirty_dir_page(struct inode *inode, struct page *page) +static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct list_head *head = &sbi->dir_inode_list; - struct dir_inode_entry *new; struct list_head *this; + list_for_each(this, head) { + struct dir_inode_entry *entry; + entry = list_entry(this, struct dir_inode_entry, list); + if (entry->inode == inode) + return -EEXIST; + } + list_add_tail(&new->list, head); +#ifdef CONFIG_F2FS_STAT_FS + sbi->n_dirty_dirs++; +#endif + return 0; +} + +void set_dirty_dir_page(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct dir_inode_entry *new; + if (!S_ISDIR(inode->i_mode)) return; retry: @@ -469,23 +486,31 @@ retry: INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) { - kmem_cache_free(inode_entry_slab, new); - goto out; - } - } - list_add_tail(&new->list, head); - sbi->n_dirty_dirs++; + if (__add_dirty_inode(inode, new)) + kmem_cache_free(inode_entry_slab, new); - BUG_ON(!S_ISDIR(inode->i_mode)); -out: inc_page_count(sbi, F2FS_DIRTY_DENTS); inode_inc_dirty_dents(inode); SetPagePrivate(page); + spin_unlock(&sbi->dir_inode_lock); +} +void add_dirty_dir_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct dir_inode_entry *new; +retry: + new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + if (!new) { + cond_resched(); + goto retry; + } + new->inode = inode; + INIT_LIST_HEAD(&new->list); + + spin_lock(&sbi->dir_inode_lock); + if (__add_dirty_inode(inode, new)) + kmem_cache_free(inode_entry_slab, new); spin_unlock(&sbi->dir_inode_lock); } @@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) - goto out; + if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + spin_unlock(&sbi->dir_inode_lock); + return; + } list_for_each(this, head) { struct dir_inode_entry *entry; @@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode) if (entry->inode == inode) { list_del(&entry->list); kmem_cache_free(inode_entry_slab, entry); +#ifdef CONFIG_F2FS_STAT_FS sbi->n_dirty_dirs--; +#endif + break; + } + } + spin_unlock(&sbi->dir_inode_lock); + + /* Only from the recovery routine */ + if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { + clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + iput(inode); + } +} + +struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct list_head *head = &sbi->dir_inode_list; + struct list_head *this; + struct inode *inode = NULL; + + spin_lock(&sbi->dir_inode_lock); + list_for_each(this, head) { + struct dir_inode_entry *entry; + entry = list_entry(this, struct dir_inode_entry, list); + if (entry->inode->i_ino == ino) { + inode = entry->inode; break; } } -out: spin_unlock(&sbi->dir_inode_lock); + return inode; } void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) @@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) block_t start_blk; struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; - unsigned int crc32 = 0; + __u32 crc32 = 0; void *kaddr; int i; @@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); - *(__le32 *)((unsigned char *)ckpt + - le32_to_cpu(ckpt->checksum_offset)) + *((__le32 *)((unsigned char *)ckpt + + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); start_blk = __start_cp_addr(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 91ff93b..035f9a3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { struct f2fs_inode_info *fi = F2FS_I(inode); +#ifdef CONFIG_F2FS_STAT_FS struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +#endif pgoff_t start_fofs, end_fofs; block_t start_blkaddr; @@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, return 0; } +#ifdef CONFIG_F2FS_STAT_FS sbi->total_hit_ext++; +#endif start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; start_blkaddr = fi->ext.blk_addr; @@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, else bh_result->b_size = UINT_MAX; +#ifdef CONFIG_F2FS_STAT_FS sbi->read_hit_ext++; +#endif read_unlock(&fi->ext.ext_lock); return 1; } @@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) if (dn.data_blkaddr == NEW_ADDR) return ERR_PTR(-EINVAL); - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) struct page *page; int err; +repeat: + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + if (!page) + return ERR_PTR(-ENOMEM); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + f2fs_put_page(page, 1); return ERR_PTR(err); + } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) + if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); -repeat: - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + } if (PageUptodate(page)) return page; @@ -274,9 +285,10 @@ repeat: * * Also, caller should grab and release a mutex by calling mutex_lock_op() and * mutex_unlock_op(). + * Note that, npage is set only by make_empty_dir. */ -struct page *get_new_data_page(struct inode *inode, pgoff_t index, - bool new_i_size) +struct page *get_new_data_page(struct inode *inode, + struct page *npage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, NULL, NULL, 0); + set_new_dnode(&dn, inode, npage, npage, 0); err = get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) return ERR_PTR(err); if (dn.data_blkaddr == NULL_ADDR) { if (reserve_new_block(&dn)) { - f2fs_put_dnode(&dn); + if (!npage) + f2fs_put_dnode(&dn); return ERR_PTR(-ENOSPC); } } - f2fs_put_dnode(&dn); + if (!npage) + f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); if (!page) @@ -325,6 +339,8 @@ repeat: if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + /* Only the directory inode sets new_i_size */ + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); mark_inode_dirty_sync(inode); } return page; @@ -481,8 +497,9 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && - need_inplace_update(inode)) { + if (unlikely(old_blk_addr != NEW_ADDR && + !is_cold_data(page) && + need_inplace_update(inode))) { rewrite_data_page(F2FS_SB(inode->i_sb), page, old_blk_addr); } else { @@ -684,6 +701,27 @@ err: return err; } +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + SetPageUptodate(page); + set_page_dirty(page); + + if (pos + copied > i_size_read(inode)) { + i_size_write(inode, pos + copied); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + unlock_page(page); + page_cache_release(page); + return copied; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, get_data_block_ro); } -static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) +static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = { .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, - .write_end = nobh_write_end, + .write_end = f2fs_write_end, .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8d99437..0d6c6aa 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -175,12 +175,12 @@ get_cache: static int stat_show(struct seq_file *s, void *v) { - struct f2fs_stat_info *si, *next; + struct f2fs_stat_info *si; int i = 0; int j; mutex_lock(&f2fs_stat_mutex); - list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { + list_for_each_entry(si, &f2fs_stat_list, stat_list) { char devname[BDEVNAME_SIZE]; update_general_status(si->sbi); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1ac6b93..62f0d59 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -13,6 +13,7 @@ #include "f2fs.h" #include "node.h" #include "acl.h" +#include "xattr.h" static unsigned long dir_blocks(struct inode *inode) { @@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page = NULL; - struct f2fs_dir_entry *de = NULL; - struct f2fs_dentry_block *dentry_blk = NULL; + struct page *page; + struct f2fs_dir_entry *de; + struct f2fs_dentry_block *dentry_blk; page = get_lock_data_page(dir, 0); if (IS_ERR(page)) @@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_put_page(page, 1); } -void init_dent_inode(const struct qstr *name, struct page *ipage) +static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_node *rn; - if (IS_ERR(ipage)) - return; - - wait_on_page_writeback(ipage); - /* copy name info. to this inode page */ rn = (struct f2fs_node *)page_address(ipage); rn->i.i_namelen = cpu_to_le32(name->len); @@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -static int make_empty_dir(struct inode *inode, struct inode *parent) +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) { struct page *dentry_page; struct f2fs_dentry_block *dentry_blk; struct f2fs_dir_entry *de; void *kaddr; - dentry_page = get_new_data_page(inode, 0, true); + dentry_page = get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); @@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent) return 0; } -static int init_inode_metadata(struct inode *inode, +static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { + struct page *page; + int err; + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - int err; - err = new_inode_page(inode, name); - if (err) - return err; + page = new_inode_page(inode, name); + if (IS_ERR(page)) + return page; if (S_ISDIR(inode->i_mode)) { - err = make_empty_dir(inode, dir); - if (err) { - remove_inode_page(inode); - return err; - } + err = make_empty_dir(inode, dir, page); + if (err) + goto error; } err = f2fs_init_acl(inode, dir); - if (err) { - remove_inode_page(inode); - return err; - } + if (err) + goto error; + + err = f2fs_init_security(inode, dir, name, page); + if (err) + goto error; + + wait_on_page_writeback(page); } else { - struct page *ipage; - ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - set_cold_node(inode, ipage); - init_dent_inode(name, ipage); - f2fs_put_page(ipage, 1); + page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); + if (IS_ERR(page)) + return page; + + wait_on_page_writeback(page); + set_cold_node(inode, page); } + + init_dent_inode(name, page); + + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + file_lost_pino(inode); inc_nlink(inode); - update_inode_page(inode); } - return 0; + return page; + +error: + f2fs_put_page(page, 1); + remove_inode_page(inode); + return ERR_PTR(err); } static void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - bool need_dir_update = false; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) { inc_nlink(dir); - need_dir_update = true; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; - need_dir_update = true; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (need_dir_update) + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) update_inode_page(dir); else mark_inode_dirty(dir); @@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); + struct page *page; int err = 0; int i; @@ -448,7 +459,7 @@ start: bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - dentry_page = get_new_data_page(dir, block, true); + dentry_page = get_new_data_page(dir, NULL, block, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); @@ -465,12 +476,13 @@ start: ++level; goto start; add_dentry: - err = init_inode_metadata(inode, dir, name); - if (err) - goto fail; - wait_on_page_writeback(dentry_page); + page = init_inode_metadata(inode, dir, name); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } de = &dentry_blk->dentry[bit_pos]; de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); @@ -481,11 +493,14 @@ add_dentry: test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); set_page_dirty(dentry_page); - update_parent_metadata(dir, inode, current_depth); - - /* update parent inode number before releasing dentry page */ + /* we don't need to mark_inode_dirty now */ F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + + update_parent_metadata(dir, inode, current_depth); fail: + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -591,34 +606,26 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int f2fs_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = file->f_pos; struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); - unsigned char *types = NULL; - unsigned int bit_pos = 0, start_bit_pos = 0; - int over = 0; + unsigned int bit_pos = 0; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dir_entry *de = NULL; struct page *dentry_page = NULL; - unsigned int n = 0; + unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); unsigned char d_type = DT_UNKNOWN; - int slots; - types = f2fs_filetype_table; - bit_pos = (pos % NR_DENTRY_IN_BLOCK); - n = (pos / NR_DENTRY_IN_BLOCK); + bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); for ( ; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; - start_bit_pos = bit_pos; dentry_blk = kmap(dentry_page); while (bit_pos < NR_DENTRY_IN_BLOCK) { - d_type = DT_UNKNOWN; bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); @@ -626,28 +633,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) break; de = &dentry_blk->dentry[bit_pos]; - if (types && de->file_type < F2FS_FT_MAX) - d_type = types[de->file_type]; - - over = filldir(dirent, + if (de->file_type < F2FS_FT_MAX) + d_type = f2fs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + if (!dir_emit(ctx, dentry_blk->filename[bit_pos], le16_to_cpu(de->name_len), - (n * NR_DENTRY_IN_BLOCK) + bit_pos, - le32_to_cpu(de->ino), d_type); - if (over) { - file->f_pos += bit_pos - start_bit_pos; - goto success; - } - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - bit_pos += slots; + le32_to_cpu(de->ino), d_type)) + goto stop; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos; } bit_pos = 0; - file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; + ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); dentry_page = NULL; } -success: +stop: if (dentry_page && !IS_ERR(dentry_page)) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); @@ -659,7 +664,7 @@ success: const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = f2fs_readdir, + .iterate = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, }; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20aab02..467d42d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -37,21 +37,35 @@ typecheck(unsigned long long, b) && \ ((long long)((a) - (b)) > 0)) -typedef u64 block_t; +typedef u32 block_t; /* + * should not change u32, since it is the on-disk block + * address format, __le32. + */ typedef u32 nid_t; struct f2fs_mount_info { unsigned int opt; }; -static inline __u32 f2fs_crc32(void *buff, size_t len) +#define CRCPOLY_LE 0xedb88320 + +static inline __u32 f2fs_crc32(void *buf, size_t len) { - return crc32_le(F2FS_SUPER_MAGIC, buff, len); + unsigned char *p = (unsigned char *)buf; + __u32 crc = F2FS_SUPER_MAGIC; + int i; + + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); + } + return crc; } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) +static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) { - return f2fs_crc32(buff, buff_size) == blk_crc; + return f2fs_crc32(buf, buf_size) == blk_crc; } /* @@ -148,7 +162,7 @@ struct extent_info { * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 -#define FADVISE_CP_BIT 0x02 +#define FADVISE_LOST_PINO_BIT 0x02 struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ @@ -369,7 +383,6 @@ struct f2fs_sb_info { /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ - unsigned int n_dirty_dirs; /* # of dir inodes */ /* basic file system units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -406,12 +419,15 @@ struct f2fs_sb_info { * for stat information. * one is for the LFS mode, and the other is for the SSR mode. */ +#ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ - unsigned int last_victim[2]; /* last victim segment # */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ int bg_gc; /* background gc calls */ + unsigned int n_dirty_dirs; /* # of dir inodes */ +#endif + unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ }; @@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void mutex_lock_all(struct f2fs_sb_info *sbi) { - int i = 0; - for (; i < NR_GLOBAL_LOCKS; i++) - mutex_lock(&sbi->fs_lock[i]); + int i; + + for (i = 0; i < NR_GLOBAL_LOCKS; i++) { + /* + * This is the only time we take multiple fs_lock[] + * instances; the order is immaterial since we + * always hold cp_mutex, which serializes multiple + * such operations. + */ + mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex); + } } static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) @@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ + FI_UPDATE_DIR, /* should update inode block for consistency */ + FI_DELAY_IPUT, /* used for the recovery */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) return 0; } +static inline int f2fs_readonly(struct super_block *sb) +{ + return sb->s_flags & MS_RDONLY; +} + /* * file.c */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); void f2fs_truncate(struct inode *); +int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); +int truncate_data_blocks_range(struct dnode_of_data *, int); long f2fs_ioctl(struct file *, unsigned int, unsigned long); long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -void init_dent_inode(const struct qstr *, struct page *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); @@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int remove_inode_page(struct inode *); -int new_inode_page(struct inode *, const struct qstr *); -struct page *new_node_page(struct dnode_of_data *, unsigned int); +struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); @@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void); */ void f2fs_balance_fs(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); -void locate_dirty_segment(struct f2fs_sb_info *, unsigned int); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); @@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); +void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); +struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); void init_orphan_info(struct f2fs_sb_info *); @@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); -struct page *get_new_data_page(struct inode *, pgoff_t, bool); +struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); int do_write_data_page(struct page *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1cae864..d2d2b7d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); mutex_unlock_op(sbi, ilock); + file_update_time(vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping || - page_offset(page) >= i_size_read(inode) || + page_offset(page) > i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); err = -EFAULT; @@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) - goto out; - - /* fill the page */ - wait_on_page_writeback(page); + goto mapped; /* page is wholly or partially inside EOF */ if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { @@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, set_page_dirty(page); SetPageUptodate(page); - file_update_time(vma->vm_file); +mapped: + /* fill the page */ + wait_on_page_writeback(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .remap_pages = generic_file_remap_pages, }; +static int get_parent_ino(struct inode *inode, nid_t *pino) +{ + struct dentry *dentry; + + inode = igrab(inode); + dentry = d_find_any_alias(inode); + iput(inode); + if (!dentry) + return 0; + + inode = igrab(dentry->d_parent->d_inode); + dput(dentry); + + *pino = inode->i_ino; + iput(inode); + return 1; +} + int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) .for_reclaim = 0, }; - if (inode->i_sb->s_flags & MS_RDONLY) + if (f2fs_readonly(inode->i_sb)) return 0; trace_f2fs_sync_file_enter(inode); @@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (is_cp_file(inode)) + else if (file_wrong_pino(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) need_cp = true; @@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) need_cp = true; if (need_cp) { + nid_t pino; + /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + F2FS_I(inode)->i_pino = pino; + file_got_pino(inode); + mark_inode_dirty_sync(inode); + ret = f2fs_write_inode(inode, NULL); + if (ret) + goto out; + } } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; @@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); @@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) update_extent_cache(NULL_ADDR, dn); invalidate_blocks(sbi, blkaddr); - dec_valid_block_count(sbi, dn->inode, 1); nr_free++; } if (nr_free) { + dec_valid_block_count(sbi, dn->inode, nr_free); set_page_dirty(dn->node_page); sync_inode_page(dn); } @@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode) } } -static int f2fs_getattr(struct vfsmount *mnt, +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index, f2fs_balance_fs(sbi); ilock = mutex_lock_op(sbi); - page = get_new_data_page(inode, index, false); + page = get_new_data_page(inode, NULL, index, false); mutex_unlock_op(sbi, ilock); if (!IS_ERR(page)) { @@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int ret; switch (cmd) { - case FS_IOC_GETFLAGS: + case F2FS_IOC_GETFLAGS: flags = fi->i_flags & FS_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); - case FS_IOC_SETFLAGS: + case F2FS_IOC_SETFLAGS: { unsigned int oldflags; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1496159..35f9b1a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,7 +76,9 @@ static int gc_thread_func(void *data) else wait_ms = increase_sleep_time(wait_ms); +#ifdef CONFIG_F2FS_STAT_FS sbi->bg_gc++; +#endif /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) @@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; if (!test_opt(sbi, BG_GC)) - return 0; + goto out; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) - return -ENOMEM; + if (!gc_th) { + err = -ENOMEM; + goto out; + } sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); kfree(gc_th); sbi->gc_thread = NULL; - return -ENOMEM; } - return 0; + +out: + return err; } void stop_gc_thread(struct f2fs_sb_info *sbi) @@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno; + unsigned int secno, max_cost; int nsearched = 0; p.alloc_mode = alloc_mode; select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = get_max_cost(sbi, &p); + p.min_cost = max_cost = get_max_cost(sbi, &p); mutex_lock(&dirty_i->seglist_lock); @@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_cost = cost; } - if (cost == get_max_cost(sbi, &p)) + if (cost == max_cost) continue; if (nsearched++ >= MAX_VICTIM_SEARCH) { @@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, break; } } -got_it: if (p.min_segno != NULL_SEGNO) { +got_it: if (p.alloc_mode == LFS) { secno = GET_SECNO(sbi, p.min_segno); if (gc_type == FG_GC) @@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = { static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) { - struct list_head *this; struct inode_entry *ie; - list_for_each(this, ilist) { - ie = list_entry(this, struct inode_entry, list); + list_for_each_entry(ie, ilist, list) if (ie->inode->i_ino == ino) return ie->inode; - } return NULL; } static void add_gc_inode(struct inode *inode, struct list_head *ilist) { - struct list_head *this; - struct inode_entry *new_ie, *ie; + struct inode_entry *new_ie; - list_for_each(this, ilist) { - ie = list_entry(this, struct inode_entry, list); - if (ie->inode == inode) { - iput(inode); - return; - } + if (inode == find_gc_inode(inode->i_ino, ilist)) { + iput(inode); + return; } repeat: new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 91ac7f9..2b2d45d1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; - - if (!sbi->por_doing && inode->i_nlink == 0) { - ret = -ENOENT; - goto bad_inode; - } - make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; @@ -130,8 +124,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | - __GFP_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page) set_cold_node(inode, node_page); set_page_dirty(node_page); + clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } int update_inode_page(struct inode *inode) @@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + return 0; + if (wbc) f2fs_balance_fs(sbi); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 47abc97..64c0716 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { if (is_multimedia_file(name, extlist[i])) { - set_cold_file(inode); + file_set_cold(inode); break; } } @@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - if (!sbi->por_doing) - d_instantiate(dentry, inode); + d_instantiate(dentry, inode); unlock_new_inode(inode); return 0; out: @@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi); inode->i_ctime = CURRENT_TIME; - atomic_inc(&inode->i_count); + ihold(inode); set_inode_flag(F2FS_I(inode), FI_INC_LINK); ilock = mutex_lock_op(sbi); @@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto out; - /* - * This file should be checkpointed during fsync. - * We lost i_pino from now on. - */ - set_cp_file(inode); - d_instantiate(dentry, inode); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); - make_bad_inode(inode); iput(inode); return err; } @@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, #ifdef CONFIG_F2FS_FS_XATTR @@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, @@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { }; const struct inode_operations f2fs_special_inode_operations = { + .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, #ifdef CONFIG_F2FS_FS_XATTR diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3df43b4..b418aee 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) level = get_node_path(index, offset, noffset); nids[0] = dn->inode->i_ino; - npage[0] = get_node_page(sbi, nids[0]); - if (IS_ERR(npage[0])) - return PTR_ERR(npage[0]); + npage[0] = dn->inode_page; + if (!npage[0]) { + npage[0] = get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + } parent = npage[0]; if (level != 0) nids[1] = get_nid(parent, offset[0], true); @@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i]); + npage[i] = new_node_page(dn, noffset[i], NULL); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode) return 0; } -int new_inode_page(struct inode *inode, const struct qstr *name) +struct page *new_inode_page(struct inode *inode, const struct qstr *name) { - struct page *page; struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - page = new_node_page(&dn, 0); - init_dent_inode(name, page); - if (IS_ERR(page)) - return PTR_ERR(page); - f2fs_put_page(page, 1); - return 0; + + /* caller should f2fs_put_page(page, 1); */ + return new_node_page(&dn, 0, NULL); } -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct address_space *mapping = sbi->node_inode->i_mapping; @@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) set_cold_node(dn->inode, page); dn->node_page = page; - sync_inode_page(dn); + if (ipage) + update_inode(dn->inode, ipage); + else + sync_inode_page(dn); set_page_dirty(page); if (ofs == 0) inc_valid_inode_count(sbi); @@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page) return 0; } -static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) +static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) new_ni = old_ni; new_ni.ino = ino; + if (!inc_valid_node_count(sbi, NULL, 1)) + WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR); inc_valid_inode_count(sbi); - f2fs_put_page(ipage, 1); return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0a2d72f..c65fb4f 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_file(struct inode *inode) +static inline int is_file(struct inode *inode, int type) { - return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; + return F2FS_I(inode)->i_advise & type; } -static inline void set_cold_file(struct inode *inode) +static inline void set_file(struct inode *inode, int type) { - F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; + F2FS_I(inode)->i_advise |= type; } -static inline int is_cp_file(struct inode *inode) +static inline void clear_file(struct inode *inode, int type) { - return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; + F2FS_I(inode)->i_advise &= ~type; } -static inline void set_cp_file(struct inode *inode) -{ - F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; -} +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) static inline int is_cold_data(struct page *page) { @@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page) ClearPageChecked(page); } -static inline int is_cold_node(struct page *page) +static inline int is_node(struct page *page, int type) { void *kaddr = page_address(page); struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << COLD_BIT_SHIFT); + return le32_to_cpu(rn->footer.flag) & (1 << type); } -static inline unsigned char is_fsync_dnode(struct page *page) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << FSYNC_BIT_SHIFT); -} - -static inline unsigned char is_dent_dnode(struct page *page) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << DENT_BIT_SHIFT); -} +#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) +#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) static inline void set_cold_node(struct inode *inode, struct page *page) { @@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page) rn->footer.flag = cpu_to_le32(flag); } -static inline void set_fsync_mark(struct page *page, int mark) +static inline void set_mark(struct page *page, int mark, int type) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - if (mark) - flag |= (0x1 << FSYNC_BIT_SHIFT); - else - flag &= ~(0x1 << FSYNC_BIT_SHIFT); - rn->footer.flag = cpu_to_le32(flag); -} - -static inline void set_dentry_mark(struct page *page, int mark) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = (struct f2fs_node *)page_address(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << DENT_BIT_SHIFT); + flag |= (0x1 << type); else - flag &= ~(0x1 << DENT_BIT_SHIFT); + flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); } +#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 60c8a50..d56d951 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); + void *kaddr = page_address(ipage); + struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; struct f2fs_inode *raw_inode = &(raw_node->i); - struct qstr name; + nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; + struct qstr name; struct page *page; - struct inode *dir; + struct inode *dir, *einode; int err = 0; - if (!is_dent_dnode(ipage)) - goto out; - - dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino); + if (!dir) { + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } + set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + add_dirty_dir_inode(dir); } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; - +retry: de = f2fs_find_entry(dir, &name, &page); - if (de) { + if (de && inode->i_ino == le32_to_cpu(de->ino)) { kunmap(page); f2fs_put_page(page, 0); - } else { - err = __f2fs_add_link(dir, &name, inode); + goto out; + } + if (de) { + einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + if (IS_ERR(einode)) { + WARN_ON(1); + if (PTR_ERR(einode) == -ENOENT) + err = -EEXIST; + goto out; + } + f2fs_delete_entry(de, page, einode); + iput(einode); + goto retry; } - iput(dir); + err = __f2fs_add_link(dir, &name, inode); out: - kunmap(ipage); + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " + "ino = %x, name = %s, dir = %lx, err = %d", + ino_of_node(ipage), raw_inode->i_name, + IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page) struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; struct f2fs_inode *raw_inode = &(raw_node->i); + if (!IS_INODE(node_page)) + return 0; + inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_size_write(inode, le64_to_cpu(raw_inode->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); @@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page) inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - return recover_dentry(node_page, inode); + if (is_dent_dnode(node_page)) + return recover_dentry(node_page, inode); + + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", + ino_of_node(node_page), raw_inode->i_name); + return 0; } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) @@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) lock_page(page); if (cp_ver != cpver_of_node(page)) - goto unlock_out; + break; if (!is_fsync_dnode(page)) goto next; entry = get_fsync_inode(head, ino_of_node(page)); if (entry) { - entry->blkaddr = blkaddr; if (IS_INODE(page) && is_dent_dnode(page)) set_inode_flag(F2FS_I(entry->inode), FI_INC_LINK); @@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) - goto unlock_out; + break; } /* add this fsync inode to the list */ entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); if (!entry) { err = -ENOMEM; - goto unlock_out; + break; } entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - goto unlock_out; + break; } - list_add_tail(&entry->list, head); - entry->blkaddr = blkaddr; - } - if (IS_INODE(page)) { - err = recover_inode(entry->inode, page); - if (err == -ENOENT) { - goto next; - } else if (err) { - err = -EINVAL; - goto unlock_out; - } } + entry->blkaddr = blkaddr; + + err = recover_inode(entry->inode, page); + if (err && err != -ENOENT) + break; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } -unlock_out: unlock_page(page); out: __free_pages(page, 0); return err; } -static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, - struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; @@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, } } -static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, - block_t blkaddr) +static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr, struct dnode_of_data *dn) { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); struct f2fs_summary sum; - nid_t ino; + nid_t ino, nid; void *kaddr; struct inode *inode; struct page *node_page; @@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) - return; + return 0; /* Get the previous summary */ for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { @@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, f2fs_put_page(sum_page, 1); } + /* Use the locked dnode page and inode */ + nid = le32_to_cpu(sum.nid); + if (dn->inode->i_ino == nid) { + struct dnode_of_data tdn = *dn; + tdn.nid = nid; + tdn.node_page = dn->inode_page; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + truncate_data_blocks_range(&tdn, 1); + return 0; + } else if (dn->nid == nid) { + struct dnode_of_data tdn = *dn; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + truncate_data_blocks_range(&tdn, 1); + return 0; + } + /* Get the node page */ - node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); bidx = start_bidx_of_node(ofs_of_node(node_page)) + - le16_to_cpu(sum.ofs_in_node); + le16_to_cpu(sum.ofs_in_node); ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); /* Deallocate previous index in the node page */ inode = f2fs_iget(sbi->sb, ino); if (IS_ERR(inode)) - return; + return PTR_ERR(inode); truncate_hole(inode, bidx, bidx + 1); iput(inode); + return 0; } static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, @@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; - int err = 0; + int err = 0, recovered = 0; int ilock; start = start_bidx_of_node(ofs_of_node(page)); @@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* Check the previous node page having this index */ - check_index_in_prev_nodes(sbi, dest); + err = check_index_in_prev_nodes(sbi, dest, &dn); + if (err) + goto err; set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* write dummy data page */ recover_data_page(sbi, NULL, &sum, src, dest); update_extent_cache(dest, &dn); + recovered++; } dn.ofs_in_node++; } @@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); +err: f2fs_put_dnode(&dn); mutex_unlock_op(sbi, ilock); - return 0; + + f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " + "recovered_data = %d blocks, err = %d", + inode->i_ino, recovered, err); + return err; } static int recover_data(struct f2fs_sb_info *sbi, @@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi, lock_page(page); if (cp_ver != cpver_of_node(page)) - goto unlock_out; + break; entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) @@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi, err = do_recover_data(sbi, entry->inode, page, blkaddr); if (err) - goto out; + break; if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -359,7 +403,6 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } -unlock_out: unlock_page(page); out: __free_pages(page, 0); @@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ + sbi->por_doing = 1; err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) goto out; /* step #2: recover data */ - sbi->por_doing = 1; err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - sbi->por_doing = 0; BUG_ON(!list_empty(&inode_list)); out: - destroy_fsync_dnodes(sbi, &inode_list); + destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); - write_checkpoint(sbi, false); + sbi->por_doing = 0; + if (!err) + write_checkpoint(sbi, false); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d8e84e4..a86d125 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ -void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks; @@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno, offset = 0; + unsigned int segno = -1; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); while (1) { segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - offset); + segno + 1); if (segno >= total_segs) break; __set_test_and_free(sbi, segno); - offset = segno + 1; } mutex_unlock(&dirty_i->seglist_lock); } @@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno, offset = 0; + unsigned int segno = -1; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); while (1) { segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - offset); + segno + 1); if (segno >= total_segs) break; - offset = segno + 1; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) dirty_i->nr_dirty[PRE]--; @@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) * This function should be resided under the curseg_mutex lock */ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, - struct f2fs_summary *sum, unsigned short offset) + struct f2fs_summary *sum) { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; - addr += offset * sizeof(struct f2fs_summary); + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); return; } @@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type) -{ - struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; - unsigned int segno; - unsigned int ofs = 0; - - /* - * If there is not enough reserved sections, - * we should not reuse prefree segments. - */ - if (has_not_enough_free_secs(sbi, 0)) - return NULL_SEGNO; - - /* - * NODE page should not reuse prefree segment, - * since those information is used for SPOR. - */ - if (IS_NODESEG(type)) - return NULL_SEGNO; -next: - segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs); - ofs += sbi->segs_per_sec; - - if (segno < TOTAL_SEGS(sbi)) { - int i; - - /* skip intermediate segments in a section */ - if (segno % sbi->segs_per_sec) - goto next; - - /* skip if the section is currently used */ - if (sec_usage_check(sbi, GET_SECNO(sbi, segno))) - goto next; - - /* skip if whole section is not prefree */ - for (i = 1; i < sbi->segs_per_sec; i++) - if (!test_bit(segno + i, prefree_segmap)) - goto next; - - /* skip if whole section was not free at the last checkpoint */ - for (i = 0; i < sbi->segs_per_sec; i++) - if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks) - goto next; - - return segno; - } - return NULL_SEGNO; -} - static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int segno = curseg->segno; + unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); - if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) - return !test_bit(segno + 1, free_i->free_segmap); + if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); return 0; } @@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) int dir = ALLOC_LEFT; write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + GET_SUM_BLOCK(sbi, segno)); if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; @@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, goto out; } - curseg->next_segno = check_prefree_segments(sbi, type); - - if (curseg->next_segno != NULL_SEGNO) - change_curseg(sbi, type, false); - else if (type == CURSEG_WARM_NODE) + if (type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); @@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else new_curseg(sbi, type, false); out: +#ifdef CONFIG_F2FS_STAT_FS sbi->segment_count[curseg->alloc_type]++; +#endif + return; } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; - else if (is_cold_data(page) || is_cold_file(inode)) + else if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; else return CURSEG_WARM_DATA; @@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, * because, this function updates a summary entry in the * current summary block. */ - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); +#ifdef CONFIG_F2FS_STAT_FS sbi->block_count[curseg->alloc_type]++; +#endif /* * SIT information should be updated before segment allocation, @@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); @@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, } curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ if (next_segno != segno) { @@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0; + unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); unsigned short valid_blocks; - while (segno < TOTAL_SEGS(sbi)) { + while (1) { /* find dirty segment based on free segmap */ - segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); - if (segno >= TOTAL_SEGS(sbi)) + segno = find_next_inuse(free_i, total_segs, offset); + if (segno >= total_segs) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8555f7d..75c7dc3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -34,7 +34,7 @@ static struct kmem_cache *f2fs_inode_cachep; enum { - Opt_gc_background_off, + Opt_gc_background, Opt_disable_roll_forward, Opt_discard, Opt_noheap, @@ -46,7 +46,7 @@ enum { }; static match_table_t f2fs_tokens = { - {Opt_gc_background_off, "background_gc_off"}, + {Opt_gc_background, "background_gc=%s"}, {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_discard, "discard"}, {Opt_noheap, "no_heap"}, @@ -76,6 +76,91 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +static int parse_options(struct super_block *sb, char *options) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + int arg = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, f2fs_tokens, args); + + switch (token) { + case Opt_gc_background: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strncmp(name, "on", 2)) + set_opt(sbi, BG_GC); + else if (!strncmp(name, "off", 3)) + clear_opt(sbi, BG_GC); + else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_disable_roll_forward: + set_opt(sbi, DISABLE_ROLL_FORWARD); + break; + case Opt_discard: + set_opt(sbi, DISCARD); + break; + case Opt_noheap: + set_opt(sbi, NOHEAP); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_nouser_xattr: + clear_opt(sbi, XATTR_USER); + break; +#else + case Opt_nouser_xattr: + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_noacl: + clear_opt(sbi, POSIX_ACL); + break; +#else + case Opt_noacl: + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); + break; +#endif + case Opt_active_logs: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + return -EINVAL; + sbi->active_logs = arg; + break; + case Opt_disable_ext_identify: + set_opt(sbi, DISABLE_EXT_IDENTIFY); + break; + default: + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + } + return 0; +} + static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; @@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } +/* + * f2fs_dirty_inode() is called from __mark_inode_dirty() + * + * We should call set_dirty_inode to write the dirty inode through write_inode. + */ +static void f2fs_dirty_inode(struct inode *inode, int flags) +{ + set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + return; +} + static void f2fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb) { int err; - if (sb->s_flags & MS_RDONLY) + if (f2fs_readonly(sb)) return 0; err = f2fs_sync_fs(sb, 1); @@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (test_opt(sbi, BG_GC)) - seq_puts(seq, ",background_gc_on"); + if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC)) + seq_printf(seq, ",background_gc=%s", "on"); else - seq_puts(seq, ",background_gc_off"); + seq_printf(seq, ",background_gc=%s", "off"); if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) @@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static int f2fs_remount(struct super_block *sb, int *flags, char *data) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_mount_info org_mount_opt; + int err, active_logs; + + /* + * Save the old mount options in case we + * need to restore them. + */ + org_mount_opt = sbi->mount_opt; + active_logs = sbi->active_logs; + + /* parse mount options */ + err = parse_options(sb, data); + if (err) + goto restore_opts; + + /* + * Previous and new state of filesystem is RO, + * so no point in checking GC conditions. + */ + if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + goto skip; + + /* + * We stop the GC thread if FS is mounted as RO + * or if background_gc = off is passed in mount + * option. Also sync the filesystem. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if (sbi->gc_thread) { + stop_gc_thread(sbi); + f2fs_sync_fs(sb, 1); + } + } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { + err = start_gc_thread(sbi); + if (err) + goto restore_opts; + } +skip: + /* Update the POSIXACL Flag */ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; + +restore_opts: + sbi->mount_opt = org_mount_opt; + sbi->active_logs = active_logs; + return err; +} + static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, + .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, @@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = { .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, + .remount_fs = f2fs_remount, }; static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, - char *options) -{ - substring_t args[MAX_OPT_ARGS]; - char *p; - int arg = 0; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); - - switch (token) { - case Opt_gc_background_off: - clear_opt(sbi, BG_GC); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); - break; - case Opt_discard: - set_opt(sbi, DISCARD); - break; - case Opt_noheap: - set_opt(sbi, NOHEAP); - break; -#ifdef CONFIG_F2FS_FS_XATTR - case Opt_nouser_xattr: - clear_opt(sbi, XATTR_USER); - break; -#else - case Opt_nouser_xattr: - f2fs_msg(sb, KERN_INFO, - "nouser_xattr options not supported"); - break; -#endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; -#else - case Opt_noacl: - f2fs_msg(sb, KERN_INFO, "noacl options not supported"); - break; -#endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) - return -EINVAL; - sbi->active_logs = arg; - break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); - break; - default: - f2fs_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" or missing value", - p); - return -EINVAL; - } - } - return 0; -} - static loff_t max_file_size(unsigned bits) { loff_t result = ADDRS_PER_INODE; @@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_sb_buf; } + sb->s_fs_info = sbi; /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - err = parse_options(sb, sbi, (char *)data); + err = parse_options(sb, (char *)data); if (err) goto free_sb_buf; @@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; - sb->s_fs_info = sbi; sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) "Cannot recover all fsync data errno=%ld", err); } - /* After POR, we can run background GC thread */ - err = start_gc_thread(sbi); - if (err) - goto fail; + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto fail; + } err = f2fs_build_stats(sbi); if (err) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 0b02dce..3ab07ec 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -20,6 +20,7 @@ */ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> +#include <linux/security.h> #include "f2fs.h" #include "xattr.h" @@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; break; + case F2FS_XATTR_INDEX_SECURITY: + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + break; default: return -EINVAL; } @@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, total_len = prefix_len + name_len + 1; if (list && total_len <= list_size) { memcpy(list, prefix, prefix_len); - memcpy(list+prefix_len, name, name_len); + memcpy(list + prefix_len, name, name_len); list[prefix_len + name_len] = '\0'; } return total_len; @@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, if (!capable(CAP_SYS_ADMIN)) return -EPERM; break; + case F2FS_XATTR_INDEX_SECURITY: + break; default: return -EINVAL; } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, - buffer, size); + return f2fs_getxattr(dentry->d_inode, type, name, buffer, size); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (!capable(CAP_SYS_ADMIN)) return -EPERM; break; + case F2FS_XATTR_INDEX_SECURITY: + break; default: return -EINVAL; } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, value, size); + return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL); } static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, @@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, return 0; } +#ifdef CONFIG_F2FS_FS_SECURITY +static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *page) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, (struct page *)page); + if (err < 0) + break; + } + return err; +} + +int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return security_inode_init_security(inode, dir, qstr, + &f2fs_initxattrs, ipage); +} +#endif + const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, @@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = { .set = f2fs_xattr_advise_set, }; +const struct xattr_handler f2fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = F2FS_XATTR_INDEX_SECURITY, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, +#endif [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, }; @@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_acl_default_handler, #endif &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + &f2fs_xattr_security_handler, +#endif &f2fs_xattr_advise_handler, NULL, }; @@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, return -ENODATA; page = get_node_page(sbi, fi->i_xattr_nid); + if (IS_ERR(page)) + return PTR_ERR(page); base_addr = page_address(page); list_for_each_xattr(entry, base_addr) { @@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return 0; page = get_node_page(sbi, fi->i_xattr_nid); + if (IS_ERR(page)) + return PTR_ERR(page); base_addr = page_address(page); list_for_each_xattr(entry, base_addr) { @@ -296,7 +347,7 @@ cleanup: } int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len) + const void *value, size_t value_len, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); mark_inode_dirty(inode); - page = new_node_page(&dn, XATTR_NODE_OFFSET); + page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); if (IS_ERR(page)) { alloc_nid_failed(sbi, fi->i_xattr_nid); fi->i_xattr_nid = 0; @@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } - update_inode_page(inode); + if (ipage) + update_inode(inode, ipage); + else + update_inode_page(inode); mutex_unlock_op(sbi, ilock); return 0; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 49c9558..3c0817b 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler; extern const struct xattr_handler f2fs_xattr_acl_access_handler; extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; +extern const struct xattr_handler f2fs_xattr_security_handler; extern const struct xattr_handler *f2fs_xattr_handlers[]; -extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len); -extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size); -extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, - size_t buffer_size); - +extern int f2fs_setxattr(struct inode *, int, const char *, + const void *, size_t, struct page *); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); +extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #else #define f2fs_xattr_handlers NULL static inline int f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len) + const char *name, const void *value, size_t value_len) { return -EOPNOTSUPP; } @@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, } #endif +#ifdef CONFIG_F2FS_FS_SECURITY +extern int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct page *); +#else +static inline int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return 0; +} +#endif #endif /* __F2FS_XATTR_H__ */ diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7a6f02c..3963ede 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -543,6 +543,7 @@ end_of_dir: EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { + struct dir_context ctx; void __user *dirent; int result; /* for dir ioctl */ @@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback { int short_len; }; -static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, - filldir_t filldir, int short_only, int both) +static int __fat_readdir(struct inode *inode, struct file *file, + struct dir_context *ctx, int short_only, + struct fat_ioctl_filldir_callback *both) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, unsigned char bufname[FAT_MAX_SHORT_SIZE]; int isvfat = sbi->options.isvfat; const char *fill_name = NULL; - unsigned long inum; - unsigned long lpos, dummy, *furrfu = &lpos; + int fake_offset = 0; loff_t cpos; int short_len = 0, fill_len = 0; int ret = 0; mutex_lock(&sbi->s_lock); - cpos = filp->f_pos; + cpos = ctx->pos; /* Fake . and .. for the root directory. */ if (inode->i_ino == MSDOS_ROOT_INO) { - while (cpos < 2) { - if (filldir(dirent, "..", cpos+1, cpos, - MSDOS_ROOT_INO, DT_DIR) < 0) - goto out; - cpos++; - filp->f_pos++; - } - if (cpos == 2) { - dummy = 2; - furrfu = &dummy; + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos == 2) { + fake_offset = 1; cpos = 0; } } @@ -619,7 +614,7 @@ parse_record: int status = fat_parse_long(inode, &cpos, &bh, &de, &unicode, &nr_slots); if (status < 0) { - filp->f_pos = cpos; + ctx->pos = cpos; ret = status; goto out; } else if (status == PARSE_INVALID) @@ -639,6 +634,19 @@ parse_record: /* !both && !short_only, so we don't need shortname. */ if (!both) goto start_filldir; + + short_len = fat_parse_short(sb, de, bufname, + sbi->options.dotsOK); + if (short_len == 0) + goto record_end; + /* hack for fat_ioctl_filldir() */ + both->longname = fill_name; + both->long_len = fill_len; + both->shortname = bufname; + both->short_len = short_len; + fill_name = NULL; + fill_len = 0; + goto start_filldir; } } @@ -646,28 +654,21 @@ parse_record: if (short_len == 0) goto record_end; - if (nr_slots) { - /* hack for fat_ioctl_filldir() */ - struct fat_ioctl_filldir_callback *p = dirent; - - p->longname = fill_name; - p->long_len = fill_len; - p->shortname = bufname; - p->short_len = short_len; - fill_name = NULL; - fill_len = 0; - } else { - fill_name = bufname; - fill_len = short_len; - } + fill_name = bufname; + fill_len = short_len; start_filldir: - lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); - if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) - inum = inode->i_ino; - else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { - inum = parent_ino(filp->f_path.dentry); + if (!fake_offset) + ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); + + if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) { + if (!dir_emit_dot(file, ctx)) + goto fill_failed; + } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { + if (!dir_emit_dotdot(file, ctx)) + goto fill_failed; } else { + unsigned long inum; loff_t i_pos = fat_make_i_pos(sb, bh, de); struct inode *tmp = fat_iget(sb, i_pos); if (tmp) { @@ -675,18 +676,17 @@ start_filldir: iput(tmp); } else inum = iunique(sb, MSDOS_ROOT_INO); + if (!dir_emit(ctx, fill_name, fill_len, inum, + (de->attr & ATTR_DIR) ? DT_DIR : DT_REG)) + goto fill_failed; } - if (filldir(dirent, fill_name, fill_len, *furrfu, inum, - (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) - goto fill_failed; - record_end: - furrfu = &lpos; - filp->f_pos = cpos; + fake_offset = 0; + ctx->pos = cpos; goto get_new; end_of_dir: - filp->f_pos = cpos; + ctx->pos = cpos; fill_failed: brelse(bh); if (unicode) @@ -696,10 +696,9 @@ out: return ret; } -static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int fat_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); - return __fat_readdir(inode, filp, dirent, filldir, 0, 0); + return __fat_readdir(file_inode(file), file, ctx, 0, NULL); } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ @@ -755,20 +754,25 @@ efault: \ FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) -static int fat_ioctl_readdir(struct inode *inode, struct file *filp, +static int fat_ioctl_readdir(struct inode *inode, struct file *file, void __user *dirent, filldir_t filldir, int short_only, int both) { - struct fat_ioctl_filldir_callback buf; + struct fat_ioctl_filldir_callback buf = { + .ctx.actor = filldir, + .dirent = dirent + }; int ret; buf.dirent = dirent; buf.result = 0; mutex_lock(&inode->i_mutex); + buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { - ret = __fat_readdir(inode, filp, &buf, filldir, - short_only, both); + ret = __fat_readdir(inode, file, &buf.ctx, + short_only, both ? &buf : NULL); + file->f_pos = buf.ctx.pos; } mutex_unlock(&inode->i_mutex); if (ret >= 0) @@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = fat_readdir, + .iterate = fat_readdir, .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 21664fc..4241e6f 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -86,6 +86,7 @@ struct msdos_sb_info { const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned int vol_id; /*volume ID*/ int fatent_shift; struct fatent_operations *fatent_ops; diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e..9b104f5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -114,6 +114,12 @@ out: return err; } +static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + return put_user(sbi->vol_id, user_attr); +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return fat_ioctl_set_attributes(filp, user_attr); + case FAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_get_volume_id(inode, user_attr); default: return -ENOTTY; /* Inappropriate ioctl for device */ } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index dfce656..11b51bb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1229,6 +1229,19 @@ static int fat_read_root(struct inode *inode) return 0; } +static unsigned long calc_fat_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + /* Divide first to avoid overflow */ + if (sbi->fat_bits != 12) { + unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; + return ent_per_sec * sbi->fat_length; + } + + return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; +} + /* * Read the super block of an MS-DOS FS. */ @@ -1402,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, brelse(fsinfo_bh); } + /* interpret volume ID as a little endian 32 bit integer */ + if (sbi->fat_bits == 32) + sbi->vol_id = (((u32)b->fat32.vol_id[0]) | + ((u32)b->fat32.vol_id[1] << 8) | + ((u32)b->fat32.vol_id[2] << 16) | + ((u32)b->fat32.vol_id[3] << 24)); + else /* fat 16 or 12 */ + sbi->vol_id = (((u32)b->fat16.vol_id[0]) | + ((u32)b->fat16.vol_id[1] << 8) | + ((u32)b->fat16.vol_id[2] << 16) | + ((u32)b->fat16.vol_id[3] << 24)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; @@ -1434,7 +1459,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ - fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; + fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > MAX_FAT(sb)) { if (!silent) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 359d307..628e22a 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } @@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { sb->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT-fs (%s): Filesystem has been " - "set read-only\n", sb->s_id); + fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 081b759..a783b0e 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len, * that the existing dentry can be used. The msdos fs routines will * return ENOENT or EINVAL as appropriate. */ -static int msdos_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int msdos_hash(const struct dentry *dentry, struct qstr *qstr) { struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; unsigned char msdos_name[MSDOS_NAME]; @@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode, * Compare two msdos names. If either of the names are invalid, * we fall back to doing the standard name comparison. */ -static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 2da9520..6df8d3d 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr) * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int vfat_hash(const struct dentry *dentry, struct qstr *qstr) { qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); return 0; @@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode, * that the existing dentry can be used. The vfat fs routines will * return ENOENT or EINVAL as appropriate. */ -static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr) { struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; const unsigned char *name; @@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, /* * Case insensitive compare of two vfat names. */ -static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; @@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, /* * Case sensitive compare of two vfat names. */ -static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { unsigned int alen, blen; diff --git a/fs/file_table.c b/fs/file_table.c index cd4d87a..08e719b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -227,7 +227,7 @@ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; - struct inode *inode = dentry->d_inode; + struct inode *inode = file->f_inode; might_sleep(); @@ -306,17 +306,18 @@ void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + unsigned long flags; + file_sb_list_del(file); - if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) { - unsigned long flags; - spin_lock_irqsave(&delayed_fput_lock, flags); - list_add(&file->f_u.fu_list, &delayed_fput_list); - schedule_work(&delayed_fput_work); - spin_unlock_irqrestore(&delayed_fput_lock, flags); - return; + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_u.fu_rcuhead, ____fput); + if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + return; } - init_task_work(&file->f_u.fu_rcuhead, ____fput); - task_work_add(task, &file->f_u.fu_rcuhead, true); + spin_lock_irqsave(&delayed_fput_lock, flags); + list_add(&file->f_u.fu_list, &delayed_fput_list); + schedule_work(&delayed_fput_work); + spin_unlock_irqrestore(&delayed_fput_lock, flags); } } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 664b07a..25d4099 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -49,7 +49,7 @@ static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); -static int vxfs_readdir(struct file *, void *, filldir_t); +static int vxfs_readdir(struct file *, struct dir_context *); const struct inode_operations vxfs_dir_inode_ops = { .lookup = vxfs_lookup, @@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = { const struct file_operations vxfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = vxfs_readdir, + .iterate = vxfs_readdir, }; @@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags) * Zero. */ static int -vxfs_readdir(struct file *fp, void *retp, filldir_t filler) +vxfs_readdir(struct file *fp, struct dir_context *ctx) { struct inode *ip = file_inode(fp); struct super_block *sbp = ip->i_sb; @@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) u_long page, npages, block, pblocks, nblocks, offset; loff_t pos; - switch ((long)fp->f_pos) { - case 0: - if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) - goto out; - fp->f_pos++; - /* fallthrough */ - case 1: - if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0) - goto out; - fp->f_pos++; - /* fallthrough */ + if (ctx->pos == 0) { + if (!dir_emit_dot(fp, ctx)) + return 0; + ctx->pos = 1; } - - pos = fp->f_pos - 2; + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR)) + return 0; + ctx->pos = 2; + } + pos = ctx->pos - 2; if (pos > VXFS_DIRROUND(ip->i_size)) return 0; @@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; for (; page < npages; page++, block = 0) { - caddr_t kaddr; + char *kaddr; struct page *pp; pp = vxfs_get_page(ip->i_mapping, page); if (IS_ERR(pp)) continue; - kaddr = (caddr_t)page_address(pp); + kaddr = (char *)page_address(pp); for (; block <= nblocks && block <= pblocks; block++) { - caddr_t baddr, limit; + char *baddr, *limit; struct vxfs_dirblk *dbp; struct vxfs_direct *de; @@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) (kaddr + offset) : (baddr + VXFS_DIRBLKOV(dbp))); - for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { - int over; - + for (; (char *)de <= limit; de = vxfs_next_entry(de)) { if (!de->d_reclen) break; if (!de->d_ino) continue; - offset = (caddr_t)de - kaddr; - over = filler(retp, de->d_name, de->d_namelen, - ((page << PAGE_CACHE_SHIFT) | offset) + 2, - de->d_ino, DT_UNKNOWN); - if (over) { + offset = (char *)de - kaddr; + ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; + if (!dir_emit(ctx, de->d_name, de->d_namelen, + de->d_ino, DT_UNKNOWN)) { vxfs_put_page(pp); - goto done; + return 0; } } offset = 0; @@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler) vxfs_put_page(pp); offset = 0; } - -done: - fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; -out: + ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2; return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3be5718..68851ff 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -45,6 +45,7 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data - * I/O completion. + * I/O completion. We don't do it for sync(2) writeback because it has a + * separate, external IO completion path and ->sync_fs for guaranteeing + * inode metadata is written back correctly. */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; @@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, + .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, @@ -959,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) /* * Retrieve work items and do the writeback they describe */ -long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +static long wb_do_writeback(struct bdi_writeback *wb) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; @@ -967,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) set_bit(BDI_writeback_running, &wb->bdi->state); while ((work = get_next_work_item(bdi)) != NULL) { - /* - * Override sync mode, in case we must wait for completion - * because this thread is exiting now. - */ - if (force_wait) - work->sync_mode = WB_SYNC_ALL; trace_writeback_exec(bdi, work); @@ -1021,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work) * rescuer as work_list needs to be drained. */ do { - pages_written = wb_do_writeback(wb, 0); + pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&bdi->work_list)); } else { @@ -1362,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb) .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, + .for_sync = 1, }; /* Nothing to do? */ diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index b52aed1..f7cff36 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object( struct fscache_object, cookie_link); cache = object->cache; - if (object->state >= FSCACHE_OBJECT_DYING || + if (fscache_object_is_dying(object) || test_bit(FSCACHE_IOERROR, &cache->flags)) cache = NULL; @@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache, BUG_ON(!ifsdef); cache->flags = 0; - ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); - ifsdef->state = FSCACHE_OBJECT_ACTIVE; + ifsdef->event_mask = + ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) & + ~(1 << FSCACHE_OBJECT_EV_CLEARED); + __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags); if (!tagname) tagname = cache->identifier; @@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, { struct fscache_object *object; - spin_lock(&cache->object_list_lock); - while (!list_empty(&cache->object_list)) { - object = list_entry(cache->object_list.next, - struct fscache_object, cache_link); - list_move_tail(&object->cache_link, dying_objects); + spin_lock(&cache->object_list_lock); - _debug("withdraw %p", object->cookie); + if (!list_empty(&cache->object_list)) { + object = list_entry(cache->object_list.next, + struct fscache_object, cache_link); + list_move_tail(&object->cache_link, dying_objects); - spin_lock(&object->lock); - spin_unlock(&cache->object_list_lock); - fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); - spin_unlock(&object->lock); + _debug("withdraw %p", object->cookie); + + /* This must be done under object_list_lock to prevent + * a race with fscache_drop_object(). + */ + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); + } + spin_unlock(&cache->object_list_lock); cond_resched(); - spin_lock(&cache->object_list_lock); } - - spin_unlock(&cache->object_list_lock); } /** diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index e2cba1f..0e91a3c 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie( atomic_set(&cookie->usage, 1); atomic_set(&cookie->n_children, 0); + /* We keep the active count elevated until relinquishment to prevent an + * attempt to wake up every time the object operations queue quiesces. + */ + atomic_set(&cookie->n_active, 1); + atomic_inc(&parent->usage); atomic_inc(&parent->n_children); @@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) cookie->flags = (1 << FSCACHE_COOKIE_LOOKING_UP) | - (1 << FSCACHE_COOKIE_CREATING) | (1 << FSCACHE_COOKIE_NO_DATA_YET); /* ask the cache to allocate objects for this cookie and its parent @@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) /* initiate the process of looking up all the objects in the chain * (done by fscache_initialise_object()) */ - fscache_enqueue_object(object); + fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); spin_unlock(&cookie->lock); @@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (object->state >= FSCACHE_OBJECT_DYING) { + if (fscache_object_is_dead(object)) { spin_unlock(&cookie->lock); goto error; } @@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, ret = -EEXIST; hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { if (p->cache == object->cache) { - if (p->state >= FSCACHE_OBJECT_DYING) + if (fscache_object_is_dying(p)) ret = -ENOBUFS; goto cant_attach_object; } @@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie, hlist_for_each_entry(p, &cookie->parent->backing_objects, cookie_link) { if (p->cache == object->cache) { - if (p->state >= FSCACHE_OBJECT_DYING) { + if (fscache_object_is_dying(p)) { ret = -ENOBUFS; spin_unlock(&cookie->parent->lock); goto cant_attach_object; @@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie) object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - if (object->state < FSCACHE_OBJECT_DYING) + if (fscache_object_is_live(object)) fscache_raise_event( object, FSCACHE_OBJECT_EV_INVALIDATE); } @@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie); */ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) { - struct fscache_cache *cache; struct fscache_object *object; - unsigned long event; fscache_stat(&fscache_n_relinquishes); if (retire) @@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) return; } - _enter("%p{%s,%p},%d", - cookie, cookie->def->name, cookie->netfs_data, retire); + _enter("%p{%s,%p,%d},%d", + cookie, cookie->def->name, cookie->netfs_data, + atomic_read(&cookie->n_active), retire); + + ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", @@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) BUG(); } - /* wait for the cookie to finish being instantiated (or to fail) */ - if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { - fscache_stat(&fscache_n_relinquishes_waitcrt); - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - } - - event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; + /* No further netfs-accessing operations on this cookie permitted */ + set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); + if (retire) + set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); -try_again: spin_lock(&cookie->lock); - - /* break links with all the active objects */ - while (!hlist_empty(&cookie->backing_objects)) { - int n_reads; - object = hlist_entry(cookie->backing_objects.first, - struct fscache_object, - cookie_link); - - _debug("RELEASE OBJ%x", object->debug_id); - - set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags); - n_reads = atomic_read(&object->n_reads); - if (n_reads) { - int n_ops = object->n_ops; - int n_in_progress = object->n_in_progress; - spin_unlock(&cookie->lock); - printk(KERN_ERR "FS-Cache:" - " Cookie '%s' still has %d outstanding reads (%d,%d)\n", - cookie->def->name, - n_reads, n_ops, n_in_progress); - wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); - printk("Wait finished\n"); - goto try_again; - } - - /* detach each cache object from the object cookie */ - spin_lock(&object->lock); - hlist_del_init(&object->cookie_link); - - cache = object->cache; - object->cookie = NULL; - fscache_raise_event(object, event); - spin_unlock(&object->lock); - - if (atomic_dec_and_test(&cookie->usage)) - /* the cookie refcount shouldn't be reduced to 0 yet */ - BUG(); + hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { + fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } + spin_unlock(&cookie->lock); - /* detach pointers back to the netfs */ + /* Wait for cessation of activity requiring access to the netfs (when + * n_active reaches 0). + */ + if (!atomic_dec_and_test(&cookie->n_active)) + wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, + TASK_UNINTERRUPTIBLE); + + /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; - - spin_unlock(&cookie->lock); + BUG_ON(cookie->stores.rnode); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); @@ -553,7 +524,7 @@ try_again: atomic_dec(&cookie->parent->n_children); } - /* finally dispose of the cookie */ + /* Dispose of the netfs's link to the cookie */ ASSERTCMP(atomic_read(&cookie->usage), >, 0); fscache_cookie_put(cookie); diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index f5b4bae..10a2ade 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = { struct fscache_cookie fscache_fsdef_index = { .usage = ATOMIC_INIT(1), + .n_active = ATOMIC_INIT(1), .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, .def = &fscache_fsdef_index_def, diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ee38fef..12d505b 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void) extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); +extern int fscache_wait_atomic_t(atomic_t *); /* * object.c */ -extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5]; - -extern void fscache_withdrawing_object(struct fscache_cache *, - struct fscache_object *); extern void fscache_enqueue_object(struct fscache_object *); /* @@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *); extern const struct file_operations fscache_objlist_fops; extern void fscache_objlist_add(struct fscache_object *); +extern void fscache_objlist_remove(struct fscache_object *); #else #define fscache_objlist_add(object) do {} while(0) +#define fscache_objlist_remove(object) do {} while(0) #endif /* @@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object, unsigned event) { BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); +#if 0 + printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n", + object->debug_id, object->event_mask, (1 << event)); +#endif if (!test_and_set_bit(event, &object->events) && test_bit(event, &object->event_mask)) fscache_enqueue_object(object); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index f9d8567..7c27907 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags) schedule(); return 0; } -EXPORT_SYMBOL(fscache_wait_bit); /* * wait_on_bit() sleep function for interruptible waiting @@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags) schedule(); return signal_pending(current); } -EXPORT_SYMBOL(fscache_wait_bit_interruptible); + +/* + * wait_on_atomic_t() sleep function for uninterruptible waiting + */ +int fscache_wait_atomic_t(atomic_t *p) +{ + schedule(); + return 0; +} diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index e028b8e..b1bb611 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) /* initialise the primary index cookie */ atomic_set(&netfs->primary_index->usage, 1); atomic_set(&netfs->primary_index->n_children, 0); + atomic_set(&netfs->primary_index->n_active, 1); netfs->primary_index->def = &fscache_fsdef_netfs_def; netfs->primary_index->parent = &fscache_fsdef_index; diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index f27c89d..e1959ef 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj) write_unlock(&fscache_object_list_lock); } -/** - * fscache_object_destroy - Note that a cache object is about to be destroyed - * @object: The object to be destroyed - * - * Note the imminent destruction and deallocation of a cache object record. +/* + * Remove an object from the object list. */ -void fscache_object_destroy(struct fscache_object *obj) +void fscache_objlist_remove(struct fscache_object *obj) { write_lock(&fscache_object_list_lock); @@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj) write_unlock(&fscache_object_list_lock); } -EXPORT_SYMBOL(fscache_object_destroy); /* * find the object in the tree on or after the specified index @@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v) { struct fscache_objlist_data *data = m->private; struct fscache_object *obj = v; + struct fscache_cookie *cookie; unsigned long config = data->config; - uint16_t keylen, auxlen; char _type[3], *type; - bool no_cookie; u8 *buf = data->buf, *p; if ((unsigned long) v == 1) { seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" - " EM EV F S" + " EM EV FL S" " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); if (config & (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) @@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) if ((unsigned long) v == 2) { seq_puts(m, "======== ======== ==== ===== === === === == =====" - " == == = =" + " == == == =" " | ================ == == ================"); if (config & (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) @@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) } \ } while(0) + cookie = obj->cookie; if (~config) { - FILTER(obj->cookie, + FILTER(cookie->def, COOKIE, NOCOOKIE); - FILTER(obj->state != FSCACHE_OBJECT_ACTIVE || + FILTER(fscache_object_is_active(obj) || obj->n_ops != 0 || obj->n_obj_ops != 0 || obj->flags || @@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v) } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ", obj->debug_id, obj->parent ? obj->parent->debug_id : -1, - fscache_object_states_short[obj->state], + obj->state->short_name, obj->n_children, obj->n_ops, obj->n_obj_ops, @@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->flags, work_busy(&obj->work)); - no_cookie = true; - keylen = auxlen = 0; - if (obj->cookie) { - spin_lock(&obj->lock); - if (obj->cookie) { - switch (obj->cookie->def->type) { - case 0: - type = "IX"; - break; - case 1: - type = "DT"; - break; - default: - sprintf(_type, "%02u", - obj->cookie->def->type); - type = _type; - break; - } + if (fscache_use_cookie(obj)) { + uint16_t keylen = 0, auxlen = 0; - seq_printf(m, "%-16s %s %2lx %16p", - obj->cookie->def->name, - type, - obj->cookie->flags, - obj->cookie->netfs_data); - - if (obj->cookie->def->get_key && - config & FSCACHE_OBJLIST_CONFIG_KEY) - keylen = obj->cookie->def->get_key( - obj->cookie->netfs_data, - buf, 400); - - if (obj->cookie->def->get_aux && - config & FSCACHE_OBJLIST_CONFIG_AUX) - auxlen = obj->cookie->def->get_aux( - obj->cookie->netfs_data, - buf + keylen, 512 - keylen); - - no_cookie = false; + switch (cookie->def->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + sprintf(_type, "%02u", cookie->def->type); + type = _type; + break; } - spin_unlock(&obj->lock); - if (!no_cookie && (keylen > 0 || auxlen > 0)) { + seq_printf(m, "%-16s %s %2lx %16p", + cookie->def->name, + type, + cookie->flags, + cookie->netfs_data); + + if (cookie->def->get_key && + config & FSCACHE_OBJLIST_CONFIG_KEY) + keylen = cookie->def->get_key(cookie->netfs_data, + buf, 400); + + if (cookie->def->get_aux && + config & FSCACHE_OBJLIST_CONFIG_AUX) + auxlen = cookie->def->get_aux(cookie->netfs_data, + buf + keylen, 512 - keylen); + fscache_unuse_cookie(obj); + + if (keylen > 0 || auxlen > 0) { seq_printf(m, " "); for (p = buf; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); @@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) seq_printf(m, "%02x", *p++); } } - } - if (no_cookie) - seq_printf(m, "<no_cookie>\n"); - else seq_printf(m, "\n"); + } else { + seq_printf(m, "<no_netfs>\n"); + } return 0; } diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 50d41c1..86d75a6 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -15,52 +15,131 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> #include <linux/slab.h> +#include <linux/prefetch.h> #include "internal.h" -const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { - [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", - [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", - [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", - [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", - [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", - [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING", - [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", - [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", - [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", - [FSCACHE_OBJECT_ABORT_INIT] = "OBJECT_ABORT_INIT", - [FSCACHE_OBJECT_RELEASING] = "OBJECT_RELEASING", - [FSCACHE_OBJECT_RECYCLING] = "OBJECT_RECYCLING", - [FSCACHE_OBJECT_WITHDRAWING] = "OBJECT_WITHDRAWING", - [FSCACHE_OBJECT_DEAD] = "OBJECT_DEAD", +static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int); +static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int); +static const struct fscache_state *fscache_drop_object(struct fscache_object *, int); +static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int); +static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int); +static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int); +static const struct fscache_state *fscache_kill_object(struct fscache_object *, int); +static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int); +static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_available(struct fscache_object *, int); +static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); +static const struct fscache_state *fscache_update_object(struct fscache_object *, int); + +#define __STATE_NAME(n) fscache_osm_##n +#define STATE(n) (&__STATE_NAME(n)) + +/* + * Define a work state. Work states are execution states. No event processing + * is performed by them. The function attached to a work state returns a + * pointer indicating the next state to which the state machine should + * transition. Returning NO_TRANSIT repeats the current state, but goes back + * to the scheduler first. + */ +#define WORK_STATE(n, sn, f) \ + const struct fscache_state __STATE_NAME(n) = { \ + .name = #n, \ + .short_name = sn, \ + .work = f \ + } + +/* + * Returns from work states. + */ +#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); }) + +#define NO_TRANSIT ((struct fscache_state *)NULL) + +/* + * Define a wait state. Wait states are event processing states. No execution + * is performed by them. Wait states are just tables of "if event X occurs, + * clear it and transition to state Y". The dispatcher returns to the + * scheduler if none of the events in which the wait state has an interest are + * currently pending. + */ +#define WAIT_STATE(n, sn, ...) \ + const struct fscache_state __STATE_NAME(n) = { \ + .name = #n, \ + .short_name = sn, \ + .work = NULL, \ + .transitions = { __VA_ARGS__, { 0, NULL } } \ + } + +#define TRANSIT_TO(state, emask) \ + { .events = (emask), .transit_to = STATE(state) } + +/* + * The object state machine. + */ +static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); +static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); +static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); +static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); +static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); +static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); +static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); + +static WORK_STATE(INVALIDATE_OBJECT, "INVL", fscache_invalidate_object); +static WORK_STATE(UPDATE_OBJECT, "UPDT", fscache_update_object); + +static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); +static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); +static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); +static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); +static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); + +static WAIT_STATE(WAIT_FOR_INIT, "?INI", + TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); + +static WAIT_STATE(WAIT_FOR_PARENT, "?PRN", + TRANSIT_TO(PARENT_READY, 1 << FSCACHE_OBJECT_EV_PARENT_READY)); + +static WAIT_STATE(WAIT_FOR_CMD, "?CMD", + TRANSIT_TO(INVALIDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_INVALIDATE), + TRANSIT_TO(UPDATE_OBJECT, 1 << FSCACHE_OBJECT_EV_UPDATE), + TRANSIT_TO(JUMPSTART_DEPS, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); + +static WAIT_STATE(WAIT_FOR_CLEARANCE, "?CLR", + TRANSIT_TO(KILL_OBJECT, 1 << FSCACHE_OBJECT_EV_CLEARED)); + +/* + * Out-of-band event transition tables. These are for handling unexpected + * events, such as an I/O error. If an OOB event occurs, the state machine + * clears and disables the event and forces a transition to the nominated work + * state (acurrently executing work states will complete first). + * + * In such a situation, object->state remembers the state the machine should + * have been in/gone to and returning NO_TRANSIT returns to that. + */ +static const struct fscache_transition fscache_osm_init_oob[] = { + TRANSIT_TO(ABORT_INIT, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } +}; + +static const struct fscache_transition fscache_osm_lookup_oob[] = { + TRANSIT_TO(LOOKUP_FAILURE, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } }; -EXPORT_SYMBOL(fscache_object_states); - -const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { - [FSCACHE_OBJECT_INIT] = "INIT", - [FSCACHE_OBJECT_LOOKING_UP] = "LOOK", - [FSCACHE_OBJECT_CREATING] = "CRTN", - [FSCACHE_OBJECT_AVAILABLE] = "AVBL", - [FSCACHE_OBJECT_ACTIVE] = "ACTV", - [FSCACHE_OBJECT_INVALIDATING] = "INVL", - [FSCACHE_OBJECT_UPDATING] = "UPDT", - [FSCACHE_OBJECT_DYING] = "DYNG", - [FSCACHE_OBJECT_LC_DYING] = "LCDY", - [FSCACHE_OBJECT_ABORT_INIT] = "ABTI", - [FSCACHE_OBJECT_RELEASING] = "RELS", - [FSCACHE_OBJECT_RECYCLING] = "RCYC", - [FSCACHE_OBJECT_WITHDRAWING] = "WTHD", - [FSCACHE_OBJECT_DEAD] = "DEAD", + +static const struct fscache_transition fscache_osm_run_oob[] = { + TRANSIT_TO(KILL_OBJECT, + (1 << FSCACHE_OBJECT_EV_ERROR) | + (1 << FSCACHE_OBJECT_EV_KILL)), + { 0, NULL } }; static int fscache_get_object(struct fscache_object *); static void fscache_put_object(struct fscache_object *); -static void fscache_initialise_object(struct fscache_object *); -static void fscache_lookup_object(struct fscache_object *); -static void fscache_object_available(struct fscache_object *); -static void fscache_invalidate_object(struct fscache_object *); -static void fscache_release_object(struct fscache_object *); -static void fscache_withdraw_object(struct fscache_object *); -static void fscache_enqueue_dependents(struct fscache_object *); +static bool fscache_enqueue_dependents(struct fscache_object *, int); static void fscache_dequeue_object(struct fscache_object *); /* @@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object) object->debug_id, parent->debug_id, parent->n_ops); spin_lock_nested(&parent->lock, 1); - parent->n_ops--; parent->n_obj_ops--; + parent->n_ops--; if (parent->n_ops == 0) fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); spin_unlock(&parent->lock); } /* - * Notify netfs of invalidation completion. + * Object state machine dispatcher. */ -static inline void fscache_invalidation_complete(struct fscache_cookie *cookie) +static void fscache_object_sm_dispatcher(struct fscache_object *object) { - if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); -} - -/* - * process events that have been sent to an object's state machine - * - initiates parent lookup - * - does object lookup - * - does object creation - * - does object recycling and retirement - * - does object withdrawal - */ -static void fscache_object_state_machine(struct fscache_object *object) -{ - enum fscache_object_state new_state; - struct fscache_cookie *cookie; - int event; + const struct fscache_transition *t; + const struct fscache_state *state, *new_state; + unsigned long events, event_mask; + int event = -1; ASSERT(object != NULL); _enter("{OBJ%x,%s,%lx}", - object->debug_id, fscache_object_states[object->state], - object->events); - - switch (object->state) { - /* wait for the parent object to become ready */ - case FSCACHE_OBJECT_INIT: - object->event_mask = - FSCACHE_OBJECT_EVENTS_MASK & - ~(1 << FSCACHE_OBJECT_EV_CLEARED); - fscache_initialise_object(object); - goto done; - - /* look up the object metadata on disk */ - case FSCACHE_OBJECT_LOOKING_UP: - fscache_lookup_object(object); - goto lookup_transit; - - /* create the object metadata on disk */ - case FSCACHE_OBJECT_CREATING: - fscache_lookup_object(object); - goto lookup_transit; - - /* handle an object becoming available; start pending - * operations and queue dependent operations for processing */ - case FSCACHE_OBJECT_AVAILABLE: - fscache_object_available(object); - goto active_transit; - - /* normal running state */ - case FSCACHE_OBJECT_ACTIVE: - goto active_transit; - - /* Invalidate an object on disk */ - case FSCACHE_OBJECT_INVALIDATING: - clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events); - fscache_stat(&fscache_n_invalidates_run); - fscache_stat(&fscache_n_cop_invalidate_object); - fscache_invalidate_object(object); - fscache_stat_d(&fscache_n_cop_invalidate_object); - fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); - goto active_transit; - - /* update the object metadata on disk */ - case FSCACHE_OBJECT_UPDATING: - clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); - fscache_stat(&fscache_n_updates_run); - fscache_stat(&fscache_n_cop_update_object); - object->cache->ops->update_object(object); - fscache_stat_d(&fscache_n_cop_update_object); - goto active_transit; - - /* handle an object dying during lookup or creation */ - case FSCACHE_OBJECT_LC_DYING: - object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); - fscache_stat(&fscache_n_cop_lookup_complete); - object->cache->ops->lookup_complete(object); - fscache_stat_d(&fscache_n_cop_lookup_complete); - - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_DYING; - cookie = object->cookie; - if (cookie) { - if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, - &cookie->flags)) - wake_up_bit(&cookie->flags, - FSCACHE_COOKIE_LOOKING_UP); - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, - &cookie->flags)) - wake_up_bit(&cookie->flags, - FSCACHE_COOKIE_CREATING); + object->debug_id, object->state->name, object->events); + + event_mask = object->event_mask; +restart: + object->event_mask = 0; /* Mask normal event handling */ + state = object->state; +restart_masked: + events = object->events; + + /* Handle any out-of-band events (typically an error) */ + if (events & object->oob_event_mask) { + _debug("{OBJ%x} oob %lx", + object->debug_id, events & object->oob_event_mask); + for (t = object->oob_table; t->events; t++) { + if (events & t->events) { + state = t->transit_to; + ASSERT(state->work != NULL); + event = fls(events & t->events) - 1; + __clear_bit(event, &object->oob_event_mask); + clear_bit(event, &object->events); + goto execute_work_state; + } } - spin_unlock(&object->lock); + } - fscache_done_parent_op(object); + /* Wait states are just transition tables */ + if (!state->work) { + if (events & event_mask) { + for (t = state->transitions; t->events; t++) { + if (events & t->events) { + new_state = t->transit_to; + event = fls(events & t->events) - 1; + clear_bit(event, &object->events); + _debug("{OBJ%x} ev %d: %s -> %s", + object->debug_id, event, + state->name, new_state->name); + object->state = state = new_state; + goto execute_work_state; + } + } - /* wait for completion of all active operations on this object - * and the death of all child objects of this object */ - case FSCACHE_OBJECT_DYING: - dying: - clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events); - spin_lock(&object->lock); - _debug("dying OBJ%x {%d,%d}", - object->debug_id, object->n_ops, object->n_children); - if (object->n_ops == 0 && object->n_children == 0) { - object->event_mask &= - ~(1 << FSCACHE_OBJECT_EV_CLEARED); - object->event_mask |= - (1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR); - } else { - object->event_mask &= - ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR)); - object->event_mask |= - 1 << FSCACHE_OBJECT_EV_CLEARED; + /* The event mask didn't include all the tabled bits */ + BUG(); } - spin_unlock(&object->lock); - fscache_enqueue_dependents(object); - fscache_start_operations(object); - goto terminal_transit; - - /* handle an abort during initialisation */ - case FSCACHE_OBJECT_ABORT_INIT: - _debug("handle abort init %lx", object->events); - object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); - - spin_lock(&object->lock); - fscache_dequeue_object(object); - - object->state = FSCACHE_OBJECT_DYING; - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, - &object->cookie->flags)) - wake_up_bit(&object->cookie->flags, - FSCACHE_COOKIE_CREATING); - spin_unlock(&object->lock); - goto dying; - - /* handle the netfs releasing an object and possibly marking it - * obsolete too */ - case FSCACHE_OBJECT_RELEASING: - case FSCACHE_OBJECT_RECYCLING: - object->event_mask &= - ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR)); - fscache_release_object(object); - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_DEAD; - spin_unlock(&object->lock); - fscache_stat(&fscache_n_object_dead); - goto terminal_transit; - - /* handle the parent cache of this object being withdrawn from - * active service */ - case FSCACHE_OBJECT_WITHDRAWING: - object->event_mask &= - ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_ERROR)); - fscache_withdraw_object(object); - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_DEAD; - spin_unlock(&object->lock); - fscache_stat(&fscache_n_object_dead); - goto terminal_transit; - - /* complain about the object being woken up once it is - * deceased */ - case FSCACHE_OBJECT_DEAD: - printk(KERN_ERR "FS-Cache:" - " Unexpected event in dead state %lx\n", - object->events & object->event_mask); - BUG(); - - default: - printk(KERN_ERR "FS-Cache: Unknown object state %u\n", - object->state); - BUG(); - } - - /* determine the transition from a lookup state */ -lookup_transit: - event = fls(object->events & object->event_mask) - 1; - switch (event) { - case FSCACHE_OBJECT_EV_WITHDRAW: - case FSCACHE_OBJECT_EV_RETIRE: - case FSCACHE_OBJECT_EV_RELEASE: - case FSCACHE_OBJECT_EV_ERROR: - new_state = FSCACHE_OBJECT_LC_DYING; - goto change_state; - case FSCACHE_OBJECT_EV_INVALIDATE: - new_state = FSCACHE_OBJECT_INVALIDATING; - goto change_state; - case FSCACHE_OBJECT_EV_REQUEUE: - goto done; - case -1: - goto done; /* sleep until event */ - default: - goto unsupported_event; + /* Randomly woke up */ + goto unmask_events; } - /* determine the transition from an active state */ -active_transit: - event = fls(object->events & object->event_mask) - 1; - switch (event) { - case FSCACHE_OBJECT_EV_WITHDRAW: - case FSCACHE_OBJECT_EV_RETIRE: - case FSCACHE_OBJECT_EV_RELEASE: - case FSCACHE_OBJECT_EV_ERROR: - new_state = FSCACHE_OBJECT_DYING; - goto change_state; - case FSCACHE_OBJECT_EV_INVALIDATE: - new_state = FSCACHE_OBJECT_INVALIDATING; - goto change_state; - case FSCACHE_OBJECT_EV_UPDATE: - new_state = FSCACHE_OBJECT_UPDATING; - goto change_state; - case -1: - new_state = FSCACHE_OBJECT_ACTIVE; - goto change_state; /* sleep until event */ - default: - goto unsupported_event; - } +execute_work_state: + _debug("{OBJ%x} exec %s", object->debug_id, state->name); - /* determine the transition from a terminal state */ -terminal_transit: - event = fls(object->events & object->event_mask) - 1; - switch (event) { - case FSCACHE_OBJECT_EV_WITHDRAW: - new_state = FSCACHE_OBJECT_WITHDRAWING; - goto change_state; - case FSCACHE_OBJECT_EV_RETIRE: - new_state = FSCACHE_OBJECT_RECYCLING; - goto change_state; - case FSCACHE_OBJECT_EV_RELEASE: - new_state = FSCACHE_OBJECT_RELEASING; - goto change_state; - case FSCACHE_OBJECT_EV_ERROR: - new_state = FSCACHE_OBJECT_WITHDRAWING; - goto change_state; - case FSCACHE_OBJECT_EV_CLEARED: - new_state = FSCACHE_OBJECT_DYING; - goto change_state; - case -1: - goto done; /* sleep until event */ - default: - goto unsupported_event; + new_state = state->work(object, event); + event = -1; + if (new_state == NO_TRANSIT) { + _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + fscache_enqueue_object(object); + event_mask = object->oob_event_mask; + goto unmask_events; } -change_state: - spin_lock(&object->lock); - object->state = new_state; - spin_unlock(&object->lock); + _debug("{OBJ%x} %s -> %s", + object->debug_id, state->name, new_state->name); + object->state = state = new_state; -done: - _leave(" [->%s]", fscache_object_states[object->state]); - return; + if (state->work) { + if (unlikely(state->work == ((void *)2UL))) { + _leave(" [dead]"); + return; + } + goto restart_masked; + } -unsupported_event: - printk(KERN_ERR "FS-Cache:" - " Unsupported event %d [%lx/%lx] in state %s\n", - event, object->events, object->event_mask, - fscache_object_states[object->state]); - BUG(); + /* Transited to wait state */ + event_mask = object->oob_event_mask; + for (t = state->transitions; t->events; t++) + event_mask |= t->events; + +unmask_events: + object->event_mask = event_mask; + smp_mb(); + events = object->events; + if (events & event_mask) + goto restart; + _leave(" [msk %lx]", event_mask); } /* * execute an object */ -void fscache_object_work_func(struct work_struct *work) +static void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work) _enter("{OBJ%x}", object->debug_id); start = jiffies; - fscache_object_state_machine(object); + fscache_object_sm_dispatcher(object); fscache_hist(fscache_objs_histogram, start); - if (object->events & object->event_mask) - fscache_enqueue_object(object); - clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); fscache_put_object(object); } -EXPORT_SYMBOL(fscache_object_work_func); + +/** + * fscache_object_init - Initialise a cache object description + * @object: Object description + * @cookie: Cookie object will be attached to + * @cache: Cache in which backing object will be found + * + * Initialise a cache object description to its basic values. + * + * See Documentation/filesystems/caching/backend-api.txt for a complete + * description. + */ +void fscache_object_init(struct fscache_object *object, + struct fscache_cookie *cookie, + struct fscache_cache *cache) +{ + const struct fscache_transition *t; + + atomic_inc(&cache->object_count); + + object->state = STATE(WAIT_FOR_INIT); + object->oob_table = fscache_osm_init_oob; + object->flags = 1 << FSCACHE_OBJECT_IS_LIVE; + spin_lock_init(&object->lock); + INIT_LIST_HEAD(&object->cache_link); + INIT_HLIST_NODE(&object->cookie_link); + INIT_WORK(&object->work, fscache_object_work_func); + INIT_LIST_HEAD(&object->dependents); + INIT_LIST_HEAD(&object->dep_link); + INIT_LIST_HEAD(&object->pending_ops); + object->n_children = 0; + object->n_ops = object->n_in_progress = object->n_exclusive = 0; + object->events = 0; + object->store_limit = 0; + object->store_limit_l = 0; + object->cache = cache; + object->cookie = cookie; + object->parent = NULL; + + object->oob_event_mask = 0; + for (t = object->oob_table; t->events; t++) + object->oob_event_mask |= t->events; + object->event_mask = object->oob_event_mask; + for (t = object->state->transitions; t->events; t++) + object->event_mask |= t->events; +} +EXPORT_SYMBOL(fscache_object_init); + +/* + * Abort object initialisation before we start it. + */ +static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_event_mask = 0; + fscache_dequeue_object(object); + return transit_to(KILL_OBJECT); +} /* * initialise an object @@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func); * immediately to do a creation * - we may need to start the process of creating a parent and we need to wait * for the parent's lookup and creation to complete if it's not there yet - * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the - * leaf-most cookies of the object and all its children */ -static void fscache_initialise_object(struct fscache_object *object) +static const struct fscache_state *fscache_initialise_object(struct fscache_object *object, + int event) { struct fscache_object *parent; + bool success; - _enter(""); - ASSERT(object->cookie != NULL); - ASSERT(object->cookie->parent != NULL); - - if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | - (1 << FSCACHE_OBJECT_EV_RELEASE) | - (1 << FSCACHE_OBJECT_EV_RETIRE) | - (1 << FSCACHE_OBJECT_EV_WITHDRAW))) { - _debug("abort init %lx", object->events); - spin_lock(&object->lock); - object->state = FSCACHE_OBJECT_ABORT_INIT; - spin_unlock(&object->lock); - return; - } + _enter("{OBJ%x},%d", object->debug_id, event); - spin_lock(&object->cookie->lock); - spin_lock_nested(&object->cookie->parent->lock, 1); + ASSERT(list_empty(&object->dep_link)); parent = object->parent; if (!parent) { - _debug("no parent"); - set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); - } else { - spin_lock(&object->lock); - spin_lock_nested(&parent->lock, 1); - _debug("parent %s", fscache_object_states[parent->state]); - - if (parent->state >= FSCACHE_OBJECT_DYING) { - _debug("bad parent"); - set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); - } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) { - _debug("wait"); - - /* we may get woken up in this state by child objects - * binding on to us, so we need to make sure we don't - * add ourself to the list multiple times */ - if (list_empty(&object->dep_link)) { - fscache_stat(&fscache_n_cop_grab_object); - object->cache->ops->grab_object(object); - fscache_stat_d(&fscache_n_cop_grab_object); - list_add(&object->dep_link, - &parent->dependents); - - /* fscache_acquire_non_index_cookie() uses this - * to wake the chain up */ - if (parent->state == FSCACHE_OBJECT_INIT) - fscache_enqueue_object(parent); - } - } else { - _debug("go"); - parent->n_ops++; - parent->n_obj_ops++; - object->lookup_jif = jiffies; - object->state = FSCACHE_OBJECT_LOOKING_UP; - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - } + _leave(" [no parent]"); + return transit_to(DROP_OBJECT); + } - spin_unlock(&parent->lock); - spin_unlock(&object->lock); + _debug("parent: %s of:%lx", parent->state->name, parent->flags); + + if (fscache_object_is_dying(parent)) { + _leave(" [bad parent]"); + return transit_to(DROP_OBJECT); } - spin_unlock(&object->cookie->parent->lock); - spin_unlock(&object->cookie->lock); + if (fscache_object_is_available(parent)) { + _leave(" [ready]"); + return transit_to(PARENT_READY); + } + + _debug("wait"); + + spin_lock(&parent->lock); + fscache_stat(&fscache_n_cop_grab_object); + success = false; + if (fscache_object_is_live(parent) && + object->cache->ops->grab_object(object)) { + list_add(&object->dep_link, &parent->dependents); + success = true; + } + fscache_stat_d(&fscache_n_cop_grab_object); + spin_unlock(&parent->lock); + if (!success) { + _leave(" [grab failed]"); + return transit_to(DROP_OBJECT); + } + + /* fscache_acquire_non_index_cookie() uses this + * to wake the chain up */ + fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD); + _leave(" [wait]"); + return transit_to(WAIT_FOR_PARENT); +} + +/* + * Once the parent object is ready, we should kick off our lookup op. + */ +static const struct fscache_state *fscache_parent_ready(struct fscache_object *object, + int event) +{ + struct fscache_object *parent = object->parent; + + _enter("{OBJ%x},%d", object->debug_id, event); + + ASSERT(parent != NULL); + + spin_lock(&parent->lock); + parent->n_ops++; + parent->n_obj_ops++; + object->lookup_jif = jiffies; + spin_unlock(&parent->lock); + _leave(""); + return transit_to(LOOK_UP_OBJECT); } /* * look an object up in the cache from which it was allocated * - we hold an "access lock" on the parent object, so the parent object cannot * be withdrawn by either party till we've finished - * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the - * leaf-most cookies of the object and all its children */ -static void fscache_lookup_object(struct fscache_object *object) +static const struct fscache_state *fscache_look_up_object(struct fscache_object *object, + int event) { struct fscache_cookie *cookie = object->cookie; - struct fscache_object *parent; + struct fscache_object *parent = object->parent; int ret; - _enter(""); + _enter("{OBJ%x},%d", object->debug_id, event); + + object->oob_table = fscache_osm_lookup_oob; - parent = object->parent; ASSERT(parent != NULL); ASSERTCMP(parent->n_ops, >, 0); ASSERTCMP(parent->n_obj_ops, >, 0); /* make sure the parent is still available */ - ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE); - - if (parent->state >= FSCACHE_OBJECT_DYING || - test_bit(FSCACHE_IOERROR, &object->cache->flags)) { - _debug("unavailable"); - set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events); - _leave(""); - return; + ASSERT(fscache_object_is_available(parent)); + + if (fscache_object_is_dying(parent) || + test_bit(FSCACHE_IOERROR, &object->cache->flags) || + !fscache_use_cookie(object)) { + _leave(" [unavailable]"); + return transit_to(LOOKUP_FAILURE); } - _debug("LOOKUP \"%s/%s\" in \"%s\"", - parent->cookie->def->name, cookie->def->name, - object->cache->tag->name); + _debug("LOOKUP \"%s\" in \"%s\"", + cookie->def->name, object->cache->tag->name); fscache_stat(&fscache_n_object_lookups); fscache_stat(&fscache_n_cop_lookup_object); ret = object->cache->ops->lookup_object(object); fscache_stat_d(&fscache_n_cop_lookup_object); - if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) - set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + fscache_unuse_cookie(object); if (ret == -ETIMEDOUT) { /* probably stuck behind another object, so move this one to * the back of the queue */ fscache_stat(&fscache_n_object_lookups_timed_out); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + _leave(" [timeout]"); + return NO_TRANSIT; } - _leave(""); + if (ret < 0) { + _leave(" [error]"); + return transit_to(LOOKUP_FAILURE); + } + + _leave(" [ok]"); + return transit_to(OBJECT_AVAILABLE); } /** @@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; - _enter("{OBJ%x,%s}", - object->debug_id, fscache_object_states[object->state]); + _enter("{OBJ%x,%s}", object->debug_id, object->state->name); - spin_lock(&object->lock); - if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { fscache_stat(&fscache_n_object_lookups_negative); - /* transit here to allow write requests to begin stacking up - * and read requests to begin returning ENODATA */ - object->state = FSCACHE_OBJECT_CREATING; - spin_unlock(&object->lock); - - set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags); + /* Allow write requests to begin stacking up and read requests to begin + * returning ENODATA. + */ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); _debug("wake up lookup %p", &cookie->flags); - smp_mb__before_clear_bit(); - clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - smp_mb__after_clear_bit(); + clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - } else { - ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); - spin_unlock(&object->lock); } - _leave(""); } EXPORT_SYMBOL(fscache_object_lookup_negative); @@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object) { struct fscache_cookie *cookie = object->cookie; - _enter("{OBJ%x,%s}", - object->debug_id, fscache_object_states[object->state]); + _enter("{OBJ%x,%s}", object->debug_id, object->state->name); /* if we were still looking up, then we must have a positive lookup * result, in which case there may be data available */ - spin_lock(&object->lock); - if (object->state == FSCACHE_OBJECT_LOOKING_UP) { + if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { fscache_stat(&fscache_n_object_lookups_positive); - clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); + /* We do (presumably) have data */ + clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); - object->state = FSCACHE_OBJECT_AVAILABLE; - spin_unlock(&object->lock); - - smp_mb__before_clear_bit(); - clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); - smp_mb__after_clear_bit(); + /* Allow write requests to begin stacking up and read requests + * to begin shovelling data. + */ + clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); } else { - ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING); fscache_stat(&fscache_n_object_created); - - object->state = FSCACHE_OBJECT_AVAILABLE; - spin_unlock(&object->lock); - set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); - smp_wmb(); } - if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) - wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING); - + set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); _leave(""); } EXPORT_SYMBOL(fscache_obtained_object); @@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object); /* * handle an object that has just become available */ -static void fscache_object_available(struct fscache_object *object) +static const struct fscache_state *fscache_object_available(struct fscache_object *object, + int event) { - _enter("{OBJ%x}", object->debug_id); + _enter("{OBJ%x},%d", object->debug_id, event); - spin_lock(&object->lock); + object->oob_table = fscache_osm_run_oob; - if (object->cookie && - test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) - wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); + spin_lock(&object->lock); fscache_done_parent_op(object); if (object->n_in_progress == 0) { @@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object) fscache_stat(&fscache_n_cop_lookup_complete); object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); - fscache_enqueue_dependents(object); fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); fscache_stat(&fscache_n_object_avail); _leave(""); + return transit_to(JUMPSTART_DEPS); } /* - * drop an object's attachments + * Wake up this object's dependent objects now that we've become available. */ -static void fscache_drop_object(struct fscache_object *object) +static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object, + int event) { - struct fscache_object *parent = object->parent; - struct fscache_cache *cache = object->cache; + _enter("{OBJ%x},%d", object->debug_id, event); - _enter("{OBJ%x,%d}", object->debug_id, object->n_children); + if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY)) + return NO_TRANSIT; /* Not finished; requeue */ + return transit_to(WAIT_FOR_CMD); +} - ASSERTCMP(object->cookie, ==, NULL); - ASSERT(hlist_unhashed(&object->cookie_link)); +/* + * Handle lookup or creation failute. + */ +static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object, + int event) +{ + struct fscache_cookie *cookie; - spin_lock(&cache->object_list_lock); - list_del_init(&object->cache_link); - spin_unlock(&cache->object_list_lock); + _enter("{OBJ%x},%d", object->debug_id, event); - fscache_stat(&fscache_n_cop_drop_object); - cache->ops->drop_object(object); - fscache_stat_d(&fscache_n_cop_drop_object); + object->oob_event_mask = 0; - if (parent) { - _debug("release parent OBJ%x {%d}", - parent->debug_id, parent->n_children); + fscache_stat(&fscache_n_cop_lookup_complete); + object->cache->ops->lookup_complete(object); + fscache_stat_d(&fscache_n_cop_lookup_complete); - spin_lock(&parent->lock); - parent->n_children--; - if (parent->n_children == 0) - fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); - spin_unlock(&parent->lock); - object->parent = NULL; + cookie = object->cookie; + set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); + if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); + + fscache_done_parent_op(object); + return transit_to(KILL_OBJECT); +} + +/* + * Wait for completion of all active operations on this object and the death of + * all child objects of this object. + */ +static const struct fscache_state *fscache_kill_object(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x,%d,%d},%d", + object->debug_id, object->n_ops, object->n_children, event); + + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + object->oob_event_mask = 0; + + if (list_empty(&object->dependents) && + object->n_ops == 0 && + object->n_children == 0) + return transit_to(DROP_OBJECT); + + if (object->n_in_progress == 0) { + spin_lock(&object->lock); + if (object->n_ops > 0 && object->n_in_progress == 0) + fscache_start_operations(object); + spin_unlock(&object->lock); } - /* this just shifts the object release to the work processor */ - fscache_put_object(object); + if (!list_empty(&object->dependents)) + return transit_to(KILL_DEPENDENTS); - _leave(""); + return transit_to(WAIT_FOR_CLEARANCE); } /* - * release or recycle an object that the netfs has discarded + * Kill dependent objects. */ -static void fscache_release_object(struct fscache_object *object) +static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object, + int event) { - _enter(""); + _enter("{OBJ%x},%d", object->debug_id, event); - fscache_drop_object(object); + if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL)) + return NO_TRANSIT; /* Not finished */ + return transit_to(WAIT_FOR_CLEARANCE); } /* - * withdraw an object from active service + * Drop an object's attachments */ -static void fscache_withdraw_object(struct fscache_object *object) +static const struct fscache_state *fscache_drop_object(struct fscache_object *object, + int event) { - struct fscache_cookie *cookie; - bool detached; + struct fscache_object *parent = object->parent; + struct fscache_cookie *cookie = object->cookie; + struct fscache_cache *cache = object->cache; + bool awaken = false; - _enter(""); + _enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event); - spin_lock(&object->lock); - cookie = object->cookie; - if (cookie) { - /* need to get the cookie lock before the object lock, starting - * from the object pointer */ - atomic_inc(&cookie->usage); - spin_unlock(&object->lock); + ASSERT(cookie != NULL); + ASSERT(!hlist_unhashed(&object->cookie_link)); - detached = false; - spin_lock(&cookie->lock); - spin_lock(&object->lock); + /* Make sure the cookie no longer points here and that the netfs isn't + * waiting for us. + */ + spin_lock(&cookie->lock); + hlist_del_init(&object->cookie_link); + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + awaken = true; + spin_unlock(&cookie->lock); - if (object->cookie == cookie) { - hlist_del_init(&object->cookie_link); - object->cookie = NULL; - fscache_invalidation_complete(cookie); - detached = true; - } - spin_unlock(&cookie->lock); - fscache_cookie_put(cookie); - if (detached) - fscache_cookie_put(cookie); - } + if (awaken) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + /* Prevent a race with our last child, which has to signal EV_CLEARED + * before dropping our spinlock. + */ + spin_lock(&object->lock); spin_unlock(&object->lock); - fscache_drop_object(object); -} + /* Discard from the cache's collection of objects */ + spin_lock(&cache->object_list_lock); + list_del_init(&object->cache_link); + spin_unlock(&cache->object_list_lock); -/* - * withdraw an object from active service at the behest of the cache - * - need break the links to a cached object cookie - * - called under two situations: - * (1) recycler decides to reclaim an in-use object - * (2) a cache is unmounted - * - have to take care as the cookie can be being relinquished by the netfs - * simultaneously - * - the object is pinned by the caller holding a refcount on it - */ -void fscache_withdrawing_object(struct fscache_cache *cache, - struct fscache_object *object) -{ - bool enqueue = false; + fscache_stat(&fscache_n_cop_drop_object); + cache->ops->drop_object(object); + fscache_stat_d(&fscache_n_cop_drop_object); - _enter(",OBJ%x", object->debug_id); + /* The parent object wants to know when all it dependents have gone */ + if (parent) { + _debug("release parent OBJ%x {%d}", + parent->debug_id, parent->n_children); - spin_lock(&object->lock); - if (object->state < FSCACHE_OBJECT_WITHDRAWING) { - object->state = FSCACHE_OBJECT_WITHDRAWING; - enqueue = true; + spin_lock(&parent->lock); + parent->n_children--; + if (parent->n_children == 0) + fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED); + spin_unlock(&parent->lock); + object->parent = NULL; } - spin_unlock(&object->lock); - if (enqueue) - fscache_enqueue_object(object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); + fscache_stat(&fscache_n_object_dead); _leave(""); + return transit_to(OBJECT_DEAD); } /* @@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object) } /* - * discard a ref on a work item + * Discard a ref on an object */ static void fscache_put_object(struct fscache_object *object) { @@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object) fscache_stat_d(&fscache_n_cop_put_object); } +/** + * fscache_object_destroy - Note that a cache object is about to be destroyed + * @object: The object to be destroyed + * + * Note the imminent destruction and deallocation of a cache object record. + */ +void fscache_object_destroy(struct fscache_object *object) +{ + fscache_objlist_remove(object); + + /* We can get rid of the cookie now */ + fscache_cookie_put(object->cookie); + object->cookie = NULL; +} +EXPORT_SYMBOL(fscache_object_destroy); + /* * enqueue an object for metadata-type processing */ @@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object) /** * fscache_object_sleep_till_congested - Sleep until object wq is congested - * @timoutp: Scheduler sleep timeout + * @timeoutp: Scheduler sleep timeout * * Allow an object handler to sleep until the object workqueue is congested. * @@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp) EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* - * enqueue the dependents of an object for metadata-type processing - * - the caller must hold the object's lock - * - this may cause an already locked object to wind up being processed again + * Enqueue the dependents of an object for metadata-type processing. + * + * If we don't manage to finish the list before the scheduler wants to run + * again then return false immediately. We return true if the list was + * cleared. */ -static void fscache_enqueue_dependents(struct fscache_object *object) +static bool fscache_enqueue_dependents(struct fscache_object *object, int event) { struct fscache_object *dep; + bool ret = true; _enter("{OBJ%x}", object->debug_id); if (list_empty(&object->dependents)) - return; + return true; spin_lock(&object->lock); @@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object) struct fscache_object, dep_link); list_del_init(&dep->dep_link); - - /* sort onto appropriate lists */ - fscache_enqueue_object(dep); + fscache_raise_event(dep, event); fscache_put_object(dep); - if (!list_empty(&object->dependents)) - cond_resched_lock(&object->lock); + if (!list_empty(&object->dependents) && need_resched()) { + ret = false; + break; + } } spin_unlock(&object->lock); + return ret; } /* * remove an object from whatever queue it's waiting on - * - the caller must hold object->lock */ -void fscache_dequeue_object(struct fscache_object *object) +static void fscache_dequeue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); @@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object) * @data: The auxiliary data for the object * @datalen: The size of the auxiliary data * - * This function consults the netfs about the coherency state of an object + * This function consults the netfs about the coherency state of an object. + * The caller must be holding a ref on cookie->n_active (held by + * fscache_look_up_object() on behalf of the cache backend during object lookup + * and creation). */ enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen) @@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux); /* * Asynchronously invalidate an object. */ -static void fscache_invalidate_object(struct fscache_object *object) +static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object, + int event) { struct fscache_operation *op; struct fscache_cookie *cookie = object->cookie; - _enter("{OBJ%x}", object->debug_id); + _enter("{OBJ%x},%d", object->debug_id, event); + + /* We're going to need the cookie. If the cookie is not available then + * retire the object instead. + */ + if (!fscache_use_cookie(object)) { + ASSERT(object->cookie->stores.rnode == NULL); + set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags); + _leave(" [no cookie]"); + return transit_to(KILL_OBJECT); + } /* Reject any new read/write ops and abort any that are pending. */ fscache_invalidate_writes(cookie); @@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object) /* Now we have to wait for in-progress reads and writes */ op = kzalloc(sizeof(*op), GFP_KERNEL); - if (!op) { - fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); - _leave(" [ENOMEM]"); - return; - } + if (!op) + goto nomem; fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); - op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); + op->flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_EXCLUSIVE) | + (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); if (fscache_submit_exclusive_op(object, op) < 0) @@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object) /* We can allow read and write requests to come in once again. They'll * queue up behind our exclusive invalidation operation. */ - fscache_invalidation_complete(cookie); - _leave(""); - return; + if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) + wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); + _leave(" [ok]"); + return transit_to(UPDATE_OBJECT); + +nomem: + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_unuse_cookie(object); + _leave(" [ENOMEM]"); + return transit_to(KILL_OBJECT); submit_op_failed: + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); spin_unlock(&cookie->lock); kfree(op); - fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); _leave(" [EIO]"); + return transit_to(KILL_OBJECT); +} + +static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object, + int event) +{ + const struct fscache_state *s; + + fscache_stat(&fscache_n_invalidates_run); + fscache_stat(&fscache_n_cop_invalidate_object); + s = _fscache_invalidate_object(object, event); + fscache_stat_d(&fscache_n_cop_invalidate_object); + return s; +} + +/* + * Asynchronously update an object. + */ +static const struct fscache_state *fscache_update_object(struct fscache_object *object, + int event) +{ + _enter("{OBJ%x},%d", object->debug_id, event); + + fscache_stat(&fscache_n_updates_run); + fscache_stat(&fscache_n_cop_update_object); + object->cache->ops->update_object(object); + fscache_stat_d(&fscache_n_cop_update_object); + + _leave(""); + return transit_to(WAIT_FOR_CMD); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 762a9ec..318071a 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) ASSERT(list_empty(&op->pend_link)); ASSERT(op->processor != NULL); - ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); + ASSERT(fscache_object_is_available(op->object)); ASSERTCMP(atomic_read(&op->usage), >, 0); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); @@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (object->state == FSCACHE_OBJECT_CREATING) { + } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, */ static void fscache_report_unexpected_submission(struct fscache_object *object, struct fscache_operation *op, - unsigned long ostate) + const struct fscache_state *ostate) { static bool once_only; struct fscache_operation *p; @@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, once_only = true; kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, - fscache_object_states[object->state]); - kdebug("objstate=%s [%s]", - fscache_object_states[object->state], - fscache_object_states[ostate]); + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); kdebug("objflags=%lx", object->flags); kdebug("objevent=%lx [%lx]", object->events, object->event_mask); kdebug("ops=%u inp=%u exc=%u", @@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { - unsigned long ostate; + const struct fscache_state *ostate; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (object->state == FSCACHE_OBJECT_CREATING) { + } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (object->state == FSCACHE_OBJECT_DYING || - object->state == FSCACHE_OBJECT_LC_DYING || - object->state == FSCACHE_OBJECT_WITHDRAWING) { + } else if (fscache_object_is_dying(object)) { fscache_stat(&fscache_n_op_rejected); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; @@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object) } /* - * jump start the operation processing on an object - * - caller must hold object->lock + * Jump start the operation processing on an object. The caller must hold + * object->lock. */ void fscache_start_operations(struct fscache_object *object) { @@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op) object = op->object; - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { - if (atomic_dec_and_test(&object->n_reads)) { - clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, - &object->cookie->flags); - wake_up_bit(&object->cookie->flags, - FSCACHE_COOKIE_WAITING_ON_READS); - } - } + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); /* now... we may get called with the object spinlock held, so we * complete the cleanup here only if we can immediately acquire the diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ff000e5..d479ab3 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -109,7 +109,7 @@ page_busy: * allocator as the work threads writing to the cache may all end up * sleeping on memory allocation, so we may need to impose a timeout * too. */ - if (!(gfp & __GFP_WAIT)) { + if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) { fscache_stat(&fscache_n_store_vmscan_busy); return false; } @@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); - if (fscache_object_is_active(object)) { + if (fscache_object_is_active(object) && + fscache_use_cookie(object)) { fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); + fscache_unuse_cookie(object); if (ret < 0) fscache_abort_object(object); } @@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(op->n_pages, ==, 0); + ASSERTCMP(atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) @@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) * allocate a retrieval op */ static struct fscache_retrieval *fscache_alloc_retrieval( + struct fscache_cookie *cookie, struct address_space *mapping, fscache_rw_complete_t end_io_func, void *context) @@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval( } fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); - op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); + atomic_inc(&cookie->n_active); + op->op.flags = FSCACHE_OP_MYTHREAD | + (1UL << FSCACHE_OP_WAITING) | + (1UL << FSCACHE_OP_UNUSE_COOKIE); op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; @@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op) struct fscache_retrieval *op = container_of(_op, struct fscache_retrieval, op); - op->n_pages = 0; + atomic_set(&op->n_pages, 0); } /* @@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; - op = fscache_alloc_retrieval(page->mapping, end_io_func, context); + op = fscache_alloc_retrieval(cookie, page->mapping, + end_io_func,context); if (!op) { _leave(" = -ENOMEM"); return -ENOMEM; } - op->n_pages = 1; + atomic_set(&op->n_pages, 1); spin_lock(&cookie->lock); @@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); - ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); + ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)); atomic_inc(&object->n_reads); __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); @@ -465,6 +472,7 @@ nobufs_unlock_dec: atomic_dec(&object->n_reads); nobufs_unlock: spin_unlock(&cookie->lock); + atomic_dec(&cookie->n_active); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; - op = fscache_alloc_retrieval(mapping, end_io_func, context); + op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context); if (!op) return -ENOMEM; - op->n_pages = *nr_pages; + atomic_set(&op->n_pages, *nr_pages); spin_lock(&cookie->lock); @@ -589,6 +597,7 @@ nobufs_unlock_dec: atomic_dec(&object->n_reads); nobufs_unlock: spin_unlock(&cookie->lock); + atomic_dec(&cookie->n_active); kfree(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); @@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; - op = fscache_alloc_retrieval(page->mapping, NULL, NULL); + op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL); if (!op) return -ENOMEM; - op->n_pages = 1; + atomic_set(&op->n_pages, 1); spin_lock(&cookie->lock); @@ -675,6 +684,7 @@ error: nobufs_unlock: spin_unlock(&cookie->lock); + atomic_dec(&cookie->n_active); kfree(op); nobufs: fscache_stat(&fscache_n_allocs_nobufs); @@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op) */ spin_unlock(&object->lock); fscache_op_complete(&op->op, false); - _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", - _op->flags, _op->state, object->state, object->flags); + _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}", + _op->flags, _op->state, object->state->short_name, + object->flags); return; } @@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) _enter(""); - while (spin_lock(&cookie->stores_lock), - n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, - ARRAY_SIZE(results), - FSCACHE_COOKIE_PENDING_TAG), - n > 0) { + for (;;) { + spin_lock(&cookie->stores_lock); + n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, + ARRAY_SIZE(results), + FSCACHE_COOKIE_PENDING_TAG); + if (n == 0) { + spin_unlock(&cookie->stores_lock); + break; + } + for (i = n - 1; i >= 0; i--) { page = results[i]; radix_tree_delete(&cookie->stores, page->index); @@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) page_cache_release(results[i]); } - spin_unlock(&cookie->stores_lock); _leave(""); } @@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is * set) * - * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred - * fill op) + * (a) no writes yet * * (b) writes deferred till post-creation (mark page for writing and * return immediately) * * (2) negative lookup, object created, initial fill being made from netfs - * (FSCACHE_COOKIE_INITIAL_FILL is set) * * (a) fill point not yet reached this page (mark page for writing and * return) @@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_operation_init(&op->op, fscache_write_op, fscache_release_write_op); - op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); + op->op.flags = FSCACHE_OP_ASYNC | + (1 << FSCACHE_OP_WAITING) | + (1 << FSCACHE_OP_UNUSE_COOKIE); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) @@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); op->store_limit = object->store_limit; + atomic_inc(&cookie->n_active); if (fscache_submit_op(object, &op->op) < 0) goto submit_failed; @@ -945,6 +961,7 @@ already_pending: return 0; submit_failed: + atomic_dec(&cookie->n_active); spin_lock(&cookie->stores_lock); radix_tree_delete(&cookie->stores, page->index); spin_unlock(&cookie->stores_lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 254df56..0eda527 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,7 +14,7 @@ #include <linux/namei.h> #include <linux/slab.h> -static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_inode *fi = get_fuse_inode(dir); @@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) return true; if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) return true; - if (filp->f_pos == 0) + if (ctx->pos == 0) return true; return false; } @@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; + struct dentry *parent; + struct fuse_conn *fc; inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) @@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_forget_link *forget; - struct dentry *parent; u64 attr_version; /* For negative dentries, always do a fresh lookup */ @@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) entry_attr_timeout(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fc = get_fuse_conn(inode); + if (fc->readdirplus_auto) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(parent->d_inode); + dput(parent); + } } - fuse_advise_use_readdirplus(inode); return 1; } @@ -1159,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask) } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, - void *dstbuf, filldir_t filldir) + struct dir_context *ctx) { while (nbytes >= FUSE_NAME_OFFSET) { struct fuse_dirent *dirent = (struct fuse_dirent *) buf; size_t reclen = FUSE_DIRENT_SIZE(dirent); - int over; if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) break; - over = filldir(dstbuf, dirent->name, dirent->namelen, - file->f_pos, dirent->ino, dirent->type); - if (over) + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) break; buf += reclen; nbytes -= reclen; - file->f_pos = dirent->off; + ctx->pos = dirent->off; } return 0; @@ -1278,7 +1282,7 @@ out: } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - void *dstbuf, filldir_t filldir, u64 attr_version) + struct dir_context *ctx, u64 attr_version) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -1303,10 +1307,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, we need to send a FORGET for each of those which we did not link. */ - over = filldir(dstbuf, dirent->name, dirent->namelen, - file->f_pos, dirent->ino, - dirent->type); - file->f_pos = dirent->off; + over = !dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type); + ctx->pos = dirent->off; } buf += reclen; @@ -1320,7 +1323,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, return 0; } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_readdir(struct file *file, struct dir_context *ctx) { int plus, err; size_t nbytes; @@ -1343,17 +1346,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) return -ENOMEM; } - plus = fuse_use_readdirplus(inode, file); + plus = fuse_use_readdirplus(inode, ctx); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = PAGE_SIZE; if (plus) { attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIR); } fuse_request_send(fc, req); @@ -1363,11 +1366,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (!err) { if (plus) { err = parse_dirplusfile(page_address(page), nbytes, - file, dstbuf, filldir, + file, ctx, attr_version); } else { err = parse_dirfile(page_address(page), nbytes, file, - dstbuf, filldir); + ctx); } } @@ -1880,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = { static const struct file_operations fuse_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = fuse_readdir, + .iterate = fuse_readdir, .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d1c9b85..5c121fe 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -16,6 +16,7 @@ #include <linux/compat.h> #include <linux/swap.h> #include <linux/aio.h> +#include <linux/falloc.h> static const struct file_operations fuse_direct_io_file_operations; @@ -547,8 +548,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) res = io->bytes < 0 ? io->size : io->bytes; if (!is_sync_kiocb(io->iocb)) { - struct path *path = &io->iocb->ki_filp->f_path; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = file_inode(io->iocb->ki_filp); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1278,7 +1278,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1314,7 +1317,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, + fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -2365,6 +2372,11 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(inode, &attr, file); } +static inline loff_t fuse_round_up(loff_t off) +{ + return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2372,6 +2384,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; @@ -2383,10 +2396,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, i_size = i_size_read(inode); /* optimization for short read */ - if (rw != WRITE && offset + count > i_size) { + if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - count = i_size - offset; + count = min_t(loff_t, count, fuse_round_up(i_size - offset)); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2404,7 +2417,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fc->async_dio; + io->async = async_dio; io->iocb = iocb; /* @@ -2412,7 +2425,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size)) + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) io->async = false; if (rw == WRITE) @@ -2424,7 +2437,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (ret > 0 && !is_sync_kiocb(iocb)) + if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; ret = wait_on_sync_kiocb(iocb); @@ -2446,6 +2459,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; + struct inode *inode = file->f_inode; struct fuse_conn *fc = ff->fc; struct fuse_req *req; struct fuse_fallocate_in inarg = { @@ -2455,13 +2469,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; + bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_PUNCH_HOLE); if (fc->no_fallocate) return -EOPNOTSUPP; + if (lock_inode) { + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_set_nowrite(inode); + } + req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } req->in.h.opcode = FUSE_FALLOCATE; req->in.h.nodeid = ff->nodeid; @@ -2476,6 +2500,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) + fuse_write_update_size(inode, offset + length); + + if (mode & FALLOC_FL_PUNCH_HOLE) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr(inode); + +out: + if (lock_inode) { + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_release_nowrite(inode); + mutex_unlock(&inode->i_mutex); + } + return err; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6201f81..0b57859 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) @@ -867,10 +867,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) + if (arg->flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) - fc->readdirplus_auto = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; } else { diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index eb08c9e..90c6a8f 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -20,13 +20,12 @@ config GFS2_FS be found here: http://sources.redhat.com/cluster The "nolock" lock module is now built in to GFS2 by default. If - you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 - networking. + you want to use the DLM, be sure to enable IPv4/6 networking. config GFS2_FS_LOCKING_DLM bool "GFS2 DLM locking" depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ - HOTPLUG && DLM && CONFIGFS_FS && SYSFS + CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) help Multiple node locking module for GFS2 diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0bad69e..ee48ad3 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page, /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); goto out; } return 1; @@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, /* Is the page fully outside i_size? (truncate in progress) */ if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); + page->mapping->a_ops->invalidatepage(page, 0, + PAGE_CACHE_SIZE); unlock_page(page); continue; } @@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) unlock_buffer(bh); } -static void gfs2_invalidatepage(struct page *page, unsigned long offset) +static void gfs2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); struct buffer_head *bh, *head; unsigned long pos = 0; BUG_ON(!PageLocked(page)); - if (offset == 0) + if (!partial_page) ClearPageChecked(page); if (!page_has_buffers(page)) goto out; bh = head = page_buffers(page); do { + if (pos + bh->b_size > stop) + return; + if (offset <= pos) gfs2_discard(sdp, bh); pos += bh->b_size; bh = bh->b_this_page; } while (bh != head); out: - if (offset == 0) + if (!partial_page) try_to_release_page(page, 0); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1dc9a13..5e2f56f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size) unstuff = 1; } - error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? + 0 : RES_QUOTA), 0); if (error) goto do_grow_release; @@ -1286,17 +1288,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; + ret = get_write_access(inode); + if (ret) + return ret; + inode_dio_wait(inode); ret = gfs2_rs_alloc(GFS2_I(inode)); if (ret) - return ret; + goto out; oldsize = inode->i_size; - if (newsize >= oldsize) - return do_grow(inode, newsize); + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } - return do_shrink(inode, oldsize, newsize); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 4fddb3c..f2448ab 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -109,8 +109,7 @@ fail: return 0; } -static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, - struct qstr *str) +static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) { str->hash = gfs2_disk_hash(str->name, str->len); return 0; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3e82bd..0cb4c155 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) return ERR_PTR(-EIO); } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + if (hc == NULL) return ERR_PTR(-ENOMEM); ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else + if (ip->i_hash_cache) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + } else { ip->i_hash_cache = hc; + } spin_unlock(&inode->i_lock); return ip->i_hash_cache; @@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { __be64 *hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -1113,10 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (IS_ERR(hc)) return PTR_ERR(hc); - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + if (!hc2) return -ENOMEM; + h = hc2; error = gfs2_meta_inode_buffer(dip, &dibh); if (error) goto out_kfree; @@ -1145,7 +1161,10 @@ fail: gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - kfree(hc2); + if (is_vmalloc_addr(hc2)) + vfree(hc2); + else + kfree(hc2); return error; } @@ -1194,9 +1213,7 @@ static int compare_dents(const void *a, const void *b) /** * do_filldir_main - read out directory entries * @dip: The GFS2 inode - * @offset: The offset in the file to read from - * @opaque: opaque data to pass to filldir - * @filldir: The function to pass entries to + * @ctx: what to feed the entries to * @darr: an array of struct gfs2_dirent pointers to read * @entries: the number of entries in darr * @copied: pointer to int that's non-zero if a entry has been copied out @@ -1206,11 +1223,10 @@ static int compare_dents(const void *a, const void *b) * the possibility that they will fall into different readdir buffers or * that someone will want to seek to that location. * - * Returns: errno, >0 on exception from filldir + * Returns: errno, >0 if the actor tells you to stop */ -static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, - void *opaque, filldir_t filldir, +static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, const struct gfs2_dirent **darr, u32 entries, int *copied) { @@ -1218,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, u64 off, off_next; unsigned int x, y; int run = 0; - int error = 0; sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); @@ -1235,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, off_next = be32_to_cpu(dent_next->de_hash); off_next = gfs2_disk_hash2offset(off_next); - if (off < *offset) + if (off < ctx->pos) continue; - *offset = off; + ctx->pos = off; if (off_next == off) { if (*copied && !run) @@ -1246,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, } else run = 0; } else { - if (off < *offset) + if (off < ctx->pos) continue; - *offset = off; + ctx->pos = off; } - error = filldir(opaque, (const char *)(dent + 1), + if (!dir_emit(ctx, (const char *)(dent + 1), be16_to_cpu(dent->de_name_len), - off, be64_to_cpu(dent->de_inum.no_addr), - be16_to_cpu(dent->de_type)); - if (error) + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type))) return 1; *copied = 1; } - /* Increment the *offset by one, so the next time we come into the + /* Increment the ctx->pos by one, so the next time we come into the do_filldir fxn, we get the next entry instead of the last one in the current leaf */ - (*offset)++; + ctx->pos++; return 0; } @@ -1289,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr) kfree(ptr); } -static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, int *copied, unsigned *depth, +static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, + int *copied, unsigned *depth, u64 leaf_no) { struct gfs2_inode *ip = GFS2_I(inode); @@ -1368,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, } while(lfn); BUG_ON(entries2 != entries); - error = do_filldir_main(ip, offset, opaque, filldir, darr, - entries, copied); + error = do_filldir_main(ip, ctx, darr, entries, copied); out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); @@ -1428,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, /** * dir_e_read - Reads the entries from a directory into a filldir buffer * @dip: dinode pointer - * @offset: the hash of the last entry read shifted to the right once - * @opaque: buffer for the filldir function to fill - * @filldir: points to the filldir function to use + * @ctx: actor to feed the entries to * * Returns: errno */ -static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, struct file_ra_state *f_ra) +static int dir_e_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); u32 hsize, len = 0; @@ -1447,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - hash = gfs2_dir_offset2hash(*offset); + hash = gfs2_dir_offset2hash(ctx->pos); index = hash >> (32 - dip->i_depth); if (dip->i_hash_cache == NULL) @@ -1459,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, gfs2_dir_readahead(inode, hsize, index, f_ra); while (index < hsize) { - error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + error = gfs2_dir_read_leaf(inode, ctx, &copied, &depth, be64_to_cpu(lp[index])); if (error) @@ -1474,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, return error; } -int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, struct file_ra_state *f_ra) +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) { struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1489,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, return 0; if (dip->i_diskflags & GFS2_DIF_EXHASH) - return dir_e_read(inode, offset, opaque, filldir, f_ra); + return dir_e_read(inode, ctx, f_ra); if (!gfs2_is_stuffed(dip)) { gfs2_consist_inode(dip); @@ -1521,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = -EIO; goto out; } - error = do_filldir_main(dip, offset, opaque, filldir, darr, + error = do_filldir_main(dip, ctx, darr, dip->i_entries, &copied); out: kfree(darr); @@ -1537,9 +1548,9 @@ out: /** * gfs2_dir_search - Search a directory - * @dip: The GFS2 inode - * @filename: - * @inode: + * @dip: The GFS2 dir inode + * @name: The name we are looking up + * @fail_on_exist: Fail if the name exists rather than looking it up * * This routine searches a directory for a file or another directory. * Assumes a glock is held on dip. @@ -1547,22 +1558,25 @@ out: * Returns: errno */ -struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, + bool fail_on_exist) { struct buffer_head *bh; struct gfs2_dirent *dent; - struct inode *inode; + u64 addr, formal_ino; + u16 dtype; dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { if (IS_ERR(dent)) return ERR_CAST(dent); - inode = gfs2_inode_lookup(dir->i_sb, - be16_to_cpu(dent->de_type), - be64_to_cpu(dent->de_inum.no_addr), - be64_to_cpu(dent->de_inum.no_formal_ino), 0); + dtype = be16_to_cpu(dent->de_type); + addr = be64_to_cpu(dent->de_inum.no_addr); + formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); - return inode; + if (fail_on_exist) + return ERR_PTR(-EEXIST); + return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); } return ERR_PTR(-ENOENT); } @@ -1846,6 +1860,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); ht = kzalloc(size, GFP_NOFS); + if (ht == NULL) + ht = vzalloc(size); if (!ht) return -ENOMEM; @@ -1933,7 +1949,10 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - kfree(ht); + if (is_vmalloc_addr(ht)) + vfree(ht); + else + kfree(ht); return error; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 98c960b..4f03bbd 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -18,14 +18,15 @@ struct gfs2_inode; struct gfs2_inum; extern struct inode *gfs2_dir_search(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + bool fail_on_exist); extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); -extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, - filldir_t filldir, struct file_ra_state *f_ra); +extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, const struct gfs2_inode *nip, unsigned int new_type); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9973df4..8b9b377 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, } struct get_name_filldir { + struct dir_context ctx; struct gfs2_inum_host inum; char *name; }; @@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name, struct inode *dir = parent->d_inode; struct inode *inode = child->d_inode; struct gfs2_inode *dip, *ip; - struct get_name_filldir gnfd; + struct get_name_filldir gnfd = { + .ctx.actor = get_name_filldir, + .name = name + }; struct gfs2_holder gh; - u64 offset = 0; int error; struct file_ra_state f_ra = { .start = 0 }; @@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name, *name = 0; gnfd.inum.no_addr = ip->i_no_addr; gnfd.inum.no_formal_ino = ip->i_no_formal_ino; - gnfd.name = name; error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); if (error) return error; - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); + error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index acd1676..72c3866 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -82,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) } /** - * gfs2_readdir - Read directory entries from a directory + * gfs2_readdir - Iterator for a directory * @file: The directory to read from - * @dirent: Buffer for dirents - * @filldir: Function used to do the copying + * @ctx: What to feed directory entries to * * Returns: errno */ -static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +static int gfs2_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file->f_mapping->host; struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_holder d_gh; - u64 offset = file->f_pos; int error; - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - error = gfs2_glock_nq(&d_gh); - if (error) { - gfs2_holder_uninit(&d_gh); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) return error; - } - error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); + error = gfs2_dir_read(dir, ctx, &file->f_ra); gfs2_glock_dq_uninit(&d_gh); - file->f_pos = offset; - return error; } @@ -402,16 +395,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); + ret = get_write_access(inode); + if (ret) + goto out; + ret = gfs2_rs_alloc(ip); if (ret) - return ret; + goto out_write_access; gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) - goto out; + goto out_uninit; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -480,12 +477,15 @@ out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); -out: +out_uninit: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_for_stable_page(page); } +out_write_access: + put_write_access(inode); +out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } @@ -531,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) } /** - * gfs2_open - open a file - * @inode: the inode to open - * @file: the struct file for this opening + * gfs2_open_common - This is common to open and atomic_open + * @inode: The inode being opened + * @file: The file being opened * - * Returns: errno + * This maybe called under a glock or not depending upon how it has + * been called. We must always be called under a glock for regular + * files, however. For other file types, it does not matter whether + * we hold the glock or not. + * + * Returns: Error code or 0 for success */ -static int gfs2_open(struct inode *inode, struct file *file) +int gfs2_open_common(struct inode *inode, struct file *file) { - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; struct gfs2_file *fp; - int error; + int ret; + + if (S_ISREG(inode->i_mode)) { + ret = generic_file_open(inode, file); + if (ret) + return ret; + } - fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); if (!fp) return -ENOMEM; @@ -553,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file) gfs2_assert_warn(GFS2_SB(inode), !file->private_data); file->private_data = fp; + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * After atomic_open, this function is only used for opening files + * which are already cached. We must still get the glock for regular + * files to ensure that we have the file size uptodate for the large + * file check which is in the common code. That is only an issue for + * regular files though. + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + bool need_unlock = false; if (S_ISREG(ip->i_inode.i_mode)) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) - goto fail; + return error; + need_unlock = true; + } - if (!(file->f_flags & O_LARGEFILE) && - i_size_read(inode) > MAX_NON_LFS) { - error = -EOVERFLOW; - goto fail_gunlock; - } + error = gfs2_open_common(inode, file); + if (need_unlock) gfs2_glock_dq_uninit(&i_gh); - } - - return 0; -fail_gunlock: - gfs2_glock_dq_uninit(&i_gh); -fail: - file->private_data = NULL; - kfree(fp); return error; } @@ -594,10 +617,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if ((file->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - gfs2_rs_delete(ip); + if (!(file->f_mode & FMODE_WRITE)) + return 0; + gfs2_rs_delete(ip); return 0; } @@ -889,7 +912,7 @@ out_uninit: * cluster; until we do, disable leases (by just returning -EINVAL), * unless the administrator has requested purely local locking. * - * Locking: called under lock_flocks + * Locking: called under i_lock * * Returns: errno */ @@ -1041,7 +1064,7 @@ const struct file_operations gfs2_file_fops = { }; const struct file_operations gfs2_dir_fops = { - .readdir = gfs2_readdir, + .iterate = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, @@ -1071,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = { }; const struct file_operations gfs2_dir_fops_nolock = { - .readdir = gfs2_readdir, + .iterate = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, .release = gfs2_release, diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c66e99c..5f2e522 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) struct gfs2_bufdata *bd, *tmp; struct buffer_head *bh; const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); - sector_t blocknr; gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); @@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) continue; gfs2_ail_error(gl, bh); } - blocknr = bh->b_blocknr; - bh->b_private = NULL; - gfs2_remove_from_ail(bd); /* drops ref on bh */ - - bd->bd_bh = NULL; - bd->bd_blkno = blocknr; - gfs2_trans_add_revoke(sdp, bd); } GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8833a4f..bbb2715 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: @@ -312,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, goto out; } - inode = gfs2_dir_search(dir, name); + inode = gfs2_dir_search(dir, name, false); if (IS_ERR(inode)) error = PTR_ERR(inode); out: @@ -345,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, if (!dip->i_inode.i_nlink) return -ENOENT; - error = gfs2_dir_check(&dip->i_inode, name, NULL); - switch (error) { - case -ENOENT: - error = 0; - break; - case 0: - return -EEXIST; - default: - return error; - } - if (dip->i_entries == (u32)-1) return -EFBIG; if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) @@ -545,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, * gfs2_create_inode - Create a new inode * @dir: The parent directory * @dentry: The new dentry + * @file: If non-NULL, the file which is being opened * @mode: The permissions on the new inode * @dev: For device nodes, this is the device number * @symname: For symlinks, this is the link destination @@ -554,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, */ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + struct file *file, umode_t mode, dev_t dev, const char *symname, - unsigned int size, int excl) + unsigned int size, int excl, int *opened) { const struct qstr *name = &dentry->d_name; struct gfs2_holder ghs[2]; @@ -563,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; + struct dentry *d; int error; u32 aflags = 0; int arq; @@ -583,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail; error = create_ok(dip, name, mode); - if ((error == -EEXIST) && S_ISREG(mode) && !excl) { - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - gfs2_glock_dq_uninit(ghs); - d_instantiate(dentry, inode); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; - } if (error) goto fail_gunlock; + inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); + error = PTR_ERR(inode); + if (!IS_ERR(inode)) { + d = d_splice_alias(inode, dentry); + error = 0; + if (file && !IS_ERR(d)) { + if (d == NULL) + d = dentry; + if (S_ISREG(inode->i_mode)) + error = finish_open(file, d, gfs2_open_common, opened); + else + error = finish_no_open(file, d); + } + gfs2_glock_dq_uninit(ghs); + if (IS_ERR(d)) + return PTR_RET(d); + return error; + } else if (error != -ENOENT) { + goto fail_gunlock; + } + arq = error = gfs2_diradd_alloc_required(dir, name); if (error < 0) goto fail_gunlock; @@ -685,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock3; mark_inode_dirty(inode); + d_instantiate(dentry, inode); + if (file) + error = finish_open(file, dentry, gfs2_open_common, opened); gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); - d_instantiate(dentry, inode); - return 0; + return error; fail_gunlock3: gfs2_glock_dq_uninit(ghs + 1); @@ -728,36 +738,56 @@ fail: static int gfs2_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); + return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL); } /** - * gfs2_lookup - Look up a filename in a directory and return its inode + * __gfs2_lookup - Look up a filename in a directory and return its inode * @dir: The directory inode * @dentry: The dentry of the new inode - * @nd: passed from Linux VFS, ignored by us + * @file: File to be opened + * @opened: atomic_open flags * - * Called by the VFS layer. Lock dir and call gfs2_lookupi() * * Returns: errno */ -static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct file *file, int *opened) { - struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode && !IS_ERR(inode)) { - struct gfs2_glock *gl = GFS2_I(inode)->i_gl; - struct gfs2_holder gh; - int error; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) { - iput(inode); - return ERR_PTR(error); - } - gfs2_glock_dq_uninit(&gh); + struct inode *inode; + struct dentry *d; + struct gfs2_holder gh; + struct gfs2_glock *gl; + int error; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (!inode) + return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + gl = GFS2_I(inode)->i_gl; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); } - return d_splice_alias(inode, dentry); + + d = d_splice_alias(inode, dentry); + if (file && S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common, opened); + + gfs2_glock_dq_uninit(&gh); + if (error) + return ERR_PTR(error); + return d; +} + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned flags) +{ + return __gfs2_lookup(dir, dentry, NULL, NULL); } /** @@ -1075,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) return -ENAMETOOLONG; - return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); + return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL); } /** @@ -1091,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct gfs2_sbd *sdp = GFS2_SB(dir); unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); - return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0); + return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL); } /** @@ -1106,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); + return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL); +} + +/** + * gfs2_atomic_open - Atomically open a file + * @dir: The directory + * @dentry: The proposed new entry + * @file: The proposed new struct file + * @flags: open flags + * @mode: File mode + * @opened: Flag to say whether the file has been opened or not + * + * Returns: error code or 0 for success + */ + +static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, + umode_t mode, int *opened) +{ + struct dentry *d; + bool excl = !!(flags & O_EXCL); + + d = __gfs2_lookup(dir, dentry, file, opened); + if (IS_ERR(d)) + return PTR_ERR(d); + if (d == NULL) + d = dentry; + if (d->d_inode) { + if (!(*opened & FILE_OPENED)) + return finish_no_open(file, d); + return 0; + } + + if (!(flags & O_CREAT)) + return -ENOENT; + + return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened); } /* @@ -1786,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .atomic_open = gfs2_atomic_open, }; const struct inode_operations gfs2_symlink_iops = { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c53c747..ba4d949 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern int gfs2_open_common(struct inode *inode, struct file *file); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index b404f48..610613f 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) static int gfs2_ail1_empty(struct gfs2_sbd *sdp) { struct gfs2_trans *tr, *s; + int oldest_tr = 1; int ret; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { gfs2_ail1_empty_one(sdp, tr); - if (list_empty(&tr->tr_ail1_list)) + if (list_empty(&tr->tr_ail1_list) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else - break; + oldest_tr = 0; } ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); @@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { - unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); + unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); unsigned wanted = blks + reserved_blks; DEFINE_WAIT(wait); int did_wait = 0; @@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip) spin_unlock(&sdp->sd_ordered_lock); } +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct buffer_head *bh = bd->bd_bh; + struct gfs2_glock *gl = bd->bd_gl; + + gfs2_remove_from_ail(bd); + bd->bd_bh = NULL; + bh->b_private = NULL; + bd->bd_blkno = bh->b_blocknr; + bd->bd_ops = &gfs2_revoke_lops; + sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); +} + +void gfs2_write_revokes(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd, *tmp; + int have_revokes = 0; + int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + + gfs2_ail1_empty(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { + if (list_empty(&bd->bd_list)) { + have_revokes = 1; + goto done; + } + } + } +done: + spin_unlock(&sdp->sd_ail_lock); + if (have_revokes == 0) + return; + while (sdp->sd_log_num_revoke > max_revokes) + max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + max_revokes -= sdp->sd_log_num_revoke; + if (!sdp->sd_log_num_revoke) { + atomic_dec(&sdp->sd_log_blks_free); + /* If no blocks have been reserved, we need to also + * reserve a block for the header */ + if (!sdp->sd_log_blks_reserved) + atomic_dec(&sdp->sd_log_blks_free); + } + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { + if (max_revokes == 0) + goto out_of_blocks; + if (!list_empty(&bd->bd_list)) + continue; + gfs2_add_revoke(sdp, bd); + max_revokes--; + } + } +out_of_blocks: + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + if (!sdp->sd_log_num_revoke) { + atomic_inc(&sdp->sd_log_blks_free); + if (!sdp->sd_log_blks_reserved) + atomic_inc(&sdp->sd_log_blks_free); + } +} + /** * log_write_header - Get and initialize a journal header buffer * @sdp: The GFS2 superblock @@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) lh = page_address(page); clear_page(lh); - gfs2_ail1_empty(sdp); tail = current_tail(sdp); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3566f35..3721663 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); +extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_write_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index c5fa758..17c5b5d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -16,6 +16,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/bio.h> #include <linux/fs.h> +#include <linux/list_sort.h> #include "gfs2.h" #include "incore.h" @@ -212,7 +213,7 @@ static void gfs2_end_log_write(struct bio *bio, int error) fs_err(sdp, "Error %d writing to log\n", error); } - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) gfs2_end_log_write_bh(sdp, bvec, error); @@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh) kunmap_atomic(kaddr); } +static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_bufdata *bda, *bdb; + + bda = list_entry(a, struct gfs2_bufdata, bd_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_list); + + if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + return -1; + if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + return 1; + return 0; +} + static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, unsigned int total, struct list_head *blist, bool is_databuf) @@ -413,13 +428,16 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, __be64 *ptr; gfs2_log_lock(sdp); + list_sort(NULL, blist, blocknr_cmp); bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); while(total) { num = total; if (total > limit) num = limit; gfs2_log_unlock(sdp); - page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); ld = page_address(page); gfs2_log_lock(sdp); ptr = (__be64 *)(ld + 1); @@ -588,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) struct page *page; unsigned int length; + gfs2_write_revokes(sdp); if (!sdp->sd_log_num_revoke) return; @@ -834,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = { .lo_name = "revoke", }; -const struct gfs2_log_operations gfs2_rg_lops = { - .lo_name = "rg", -}; - const struct gfs2_log_operations gfs2_databuf_lops = { .lo_before_commit = databuf_lo_before_commit, .lo_after_commit = databuf_lo_after_commit, @@ -849,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = { const struct gfs2_log_operations *gfs2_log_ops[] = { &gfs2_databuf_lops, &gfs2_buf_lops, - &gfs2_rg_lops, &gfs2_revoke_lops, NULL, }; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 87e062e..9ca2e64 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -23,7 +23,6 @@ extern const struct gfs2_log_operations gfs2_glock_lops; extern const struct gfs2_log_operations gfs2_buf_lops; extern const struct gfs2_log_operations gfs2_revoke_lops; -extern const struct gfs2_log_operations gfs2_rg_lops; extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 1a89afb..0da3906 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int if (bd) { spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { - gfs2_remove_from_ail(bd); - bh->b_private = NULL; - bd->bd_bh = NULL; - bd->bd_blkno = bh->b_blocknr; gfs2_trans_add_revoke(sdp, bd); } spin_unlock(&sdp->sd_ail_lock); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 60ede2a..0262c19 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) goto fail_quotad; p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - error = IS_ERR(p); - if (error) { + if (IS_ERR(p)) { + error = PTR_ERR(p); fs_err(sdp, "can't start logd thread: %d\n", error); return error; } sdp->sd_logd_process = p; p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - error = IS_ERR(p); - if (error) { + if (IS_ERR(p)) { + error = PTR_ERR(p); fs_err(sdp, "can't start quotad thread: %d\n", error); goto fail; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c7c840e..3768c2f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -121,7 +121,7 @@ static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; return (2 * (u64)from_kqid(&init_user_ns, qid)) + - (qid.type == USRQUOTA) ? 0 : 1; + ((qid.type == USRQUOTA) ? 0 : 1); } static u64 qd2offset(struct gfs2_quota_data *qd) @@ -721,7 +721,7 @@ get_a_page: goto unlock_out; } - gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_trans_add_data(ip->i_gl, bh); kaddr = kmap_atomic(page); if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) @@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type) return error; } -static int gfs2_quota_sync_timeo(struct super_block *sb, int type) -{ - return gfs2_quota_sync(sb, type); -} - int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; @@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data) &tune->gt_statfs_quantum); /* Update quota file */ - quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, + quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, "ad_timeo, &tune->gt_quota_quantum); /* Check for & recover partially truncated inodes */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0c5a575..6931743 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -638,8 +638,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) */ void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if (ip->i_res) { + if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); @@ -1286,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp) minlen = max_t(u64, r.minlen, q->limits.discard_granularity) >> bs_shift; + if (end <= start || minlen > sdp->sd_max_rg_data) + return -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, start, 0); - rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); + rgd_end = gfs2_blk2rgrpd(sdp, end, 0); - if (end <= start || - minlen > sdp->sd_max_rg_data || - start > rgd_end->rd_data0 + rgd_end->rd_data) - return -EINVAL; + if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end)) + && (start > rgd_end->rd_data0 + rgd_end->rd_data)) + return -EINVAL; /* start is beyond the end of the fs */ while (1) { @@ -1334,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) } out: - r.len = trimmed << 9; + r.len = trimmed << bs_shift; if (copy_to_user(argp, &r, sizeof(r))) return -EFAULT; @@ -1401,9 +1405,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u32 extlen; u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; int ret; + struct inode *inode = &ip->i_inode; - extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); - extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + if (S_ISDIR(inode->i_mode)) + extlen = 1; + else { + extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested); + extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks); + } if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) return; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 917c8e1..e5639de 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode) /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1514,8 +1515,10 @@ out_unlock: if (gfs2_rs_active(ip->i_res)) gfs2_rs_deltree(ip->i_res); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); + } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) @@ -1534,6 +1537,7 @@ out: ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 7374907..2b20d70 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - struct gfs2_glock *gl = bd->bd_gl; struct gfs2_trans *tr = current->journal_info; BUG_ON(!list_empty(&bd->bd_list)); - BUG_ON(!list_empty(&bd->bd_ail_st_list)); - BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - bd->bd_ops = &gfs2_revoke_lops; + gfs2_add_revoke(sdp, bd); tr->tr_touched = 1; tr->tr_num_revoke++; - sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); - set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index f3b1a15..d3fa6bd 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -415,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); - BUG_ON(node); + if (node) { + pr_crit("new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index e0101b6..1455668 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -51,9 +51,9 @@ done: /* * hfs_readdir */ -static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int hfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; @@ -62,7 +62,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct hfs_readdir_data *rd; u16 type; - if (filp->f_pos >= inode->i_size) + if (ctx->pos >= inode->i_size) return 0; err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); @@ -73,14 +73,13 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err) goto out; - switch ((u32)filp->f_pos) { - case 0: + if (ctx->pos == 0) { /* This is completely artificial... */ - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + if (!dir_emit_dot(file, ctx)) goto out; - filp->f_pos++; - /* fall through */ - case 1: + ctx->pos = 1; + } + if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; @@ -97,18 +96,16 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) // err = -EIO; // goto out; //} - if (filldir(dirent, "..", 2, 1, + if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.ParID), DT_DIR)) goto out; - filp->f_pos++; - /* fall through */ - default: - if (filp->f_pos >= inode->i_size) - goto out; - err = hfs_brec_goto(&fd, filp->f_pos - 1); - if (err) - goto out; + ctx->pos = 2; } + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, ctx->pos - 1); + if (err) + goto out; for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { @@ -131,7 +128,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.dir.DirID), DT_DIR)) break; } else if (type == HFS_CDR_FIL) { @@ -140,7 +137,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { @@ -148,22 +145,22 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - filp->f_pos++; - if (filp->f_pos >= inode->i_size) + ctx->pos++; + if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } - rd = filp->private_data; + rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } - filp->private_data = rd; - rd->file = filp; + file->private_data = rd; + rd->file = file; list_add(&rd->list, &HFS_I(inode)->open_dir_list); } memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); @@ -306,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, const struct file_operations hfs_dir_operations = { .read = generic_read_dir, - .readdir = hfs_readdir, + .iterate = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index a73b118..0524cda 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -229,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; -extern int hfs_hash_dentry(const struct dentry *, const struct inode *, - struct qstr *); +extern int hfs_hash_dentry(const struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); -extern int hfs_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 495a976..85b610c 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -51,8 +51,7 @@ static unsigned char caseorder[256] = { /* * Hash a string to an integer in a case-independent way */ -int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *this) +int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) { const unsigned char *name = this->name; unsigned int hash, len = this->len; @@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1, * Test for equality of two strings in the HFS filename character ordering. * return 1 on failure and 0 on success */ -int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const unsigned char *n1, *n2; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index a37ac93..d8ce4bd 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -121,9 +121,9 @@ fail: return ERR_PTR(err); } -static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int hfsplus_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFSPLUS_MAX_STRLEN + 1]; @@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) struct hfsplus_readdir_data *rd; u16 type; - if (filp->f_pos >= inode->i_size) + if (file->f_pos >= inode->i_size) return 0; err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); @@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) if (err) goto out; - switch ((u32)filp->f_pos) { - case 0: + if (ctx->pos == 0) { /* This is completely artificial... */ - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) + if (!dir_emit_dot(file, ctx)) goto out; - filp->f_pos++; - /* fall through */ - case 1: + ctx->pos = 1; + } + if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; @@ -168,19 +167,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - if (filldir(dirent, "..", 2, 1, + if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.parentID), DT_DIR)) goto out; - filp->f_pos++; - /* fall through */ - default: - if (filp->f_pos >= inode->i_size) - goto out; - err = hfs_brec_goto(&fd, filp->f_pos - 1); - if (err) - goto out; + ctx->pos = 2; } - + if (ctx->pos >= inode->i_size) + goto out; + err = hfs_brec_goto(&fd, ctx->pos - 1); + if (err) + goto out; for (;;) { if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { pr_err("walked past end of dir\n"); @@ -211,7 +207,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) HFSPLUS_SB(sb)->hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) goto next; - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.folder.id), DT_DIR)) break; } else if (type == HFSPLUS_FILE) { @@ -220,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) err = -EIO; goto out; } - if (filldir(dirent, strbuf, len, filp->f_pos, + if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.id), DT_REG)) break; } else { @@ -229,22 +225,22 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; } next: - filp->f_pos++; - if (filp->f_pos >= inode->i_size) + ctx->pos++; + if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } - rd = filp->private_data; + rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } - filp->private_data = rd; - rd->file = filp; + file->private_data = rd; + rd->file = file; list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); } memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); @@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { const struct file_operations hfsplus_dir_operations = { .fsync = hfsplus_file_fsync, .read = generic_read_dir, - .readdir = hfsplus_readdir, + .iterate = hfsplus_readdir, .unlocked_ioctl = hfsplus_ioctl, .llseek = generic_file_llseek, .release = hfsplus_dir_release, diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 60b0a33..ede7931 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -495,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *, const struct hfsplus_unistr *, char *, int *); int hfsplus_asc2uni(struct super_block *, struct hfsplus_unistr *, int, const char *, int); -int hfsplus_hash_dentry(const struct dentry *dentry, - const struct inode *inode, struct qstr *str); -int hfsplus_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); +int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* wrapper.c */ diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index 2c2e47d..e8ef121 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *str) +int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) { struct super_block *sb = dentry->d_sb; const char *astr; @@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ -int hfsplus_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct super_block *sb = parent->d_sb; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 32f35f1..cddb052 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = { .show_options = hostfs_show_options, }; -int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) +int hostfs_readdir(struct file *file, struct dir_context *ctx) { void *dir; char *name; @@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) __putname(name); if (dir == NULL) return -error; - next = file->f_pos; + next = ctx->pos; while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { - error = (*filldir)(ent, name, len, file->f_pos, - ino, type); - if (error) break; - file->f_pos = next; + if (!dir_emit(ctx, name, len, ino, type)) + break; + ctx->pos = next; } close_dir(dir); return 0; @@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = { static const struct file_operations hostfs_dir_fops = { .llseek = generic_file_llseek, - .readdir = hostfs_readdir, + .iterate = hostfs_readdir, .read = generic_read_dir, }; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index f49d149..4d0a1af 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -7,8 +7,37 @@ */ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include "hpfs_fn.h" +void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n) +{ + struct buffer_head *bh; + struct blk_plug plug; + + if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size)) + return; + + bh = sb_find_get_block(s, secno); + if (bh) { + if (buffer_uptodate(bh)) { + brelse(bh); + return; + } + brelse(bh); + }; + + blk_start_plug(&plug); + while (n > 0) { + if (unlikely(secno >= hpfs_sb(s)->sb_fs_size)) + break; + sb_breadahead(s, secno); + secno++; + n--; + } + blk_finish_plug(&plug); +} + /* Map a sector into a buffer and return pointers to it and to the buffer. */ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, @@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head hpfs_lock_assert(s); + hpfs_prefetch_sectors(s, secno, ahead); + cond_resched(); *bhp = bh = sb_bread(s, secno); @@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe return NULL; } + hpfs_prefetch_sectors(s, secno, 4 + ahead); + qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { printk("HPFS: hpfs_map_4sectors: out of memory\n"); diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 05d4816..fa27980 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -12,8 +12,7 @@ * Note: the dentry argument is the parent dentry. */ -static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { unsigned long hash; int i; @@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino return 0; } -static int hpfs_compare_dentry(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { unsigned al = len; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 546f6d3..292b1ac 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,36 +33,38 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; + mutex_lock(&i->i_mutex); hpfs_lock(s); /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - mutex_unlock(&i->i_mutex); + hpfs_add_pos(i, &filp->f_pos); ok: + filp->f_pos = new_off; hpfs_unlock(s); - return filp->f_pos = new_off; -fail: mutex_unlock(&i->i_mutex); + return new_off; +fail: /*printk("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); + mutex_unlock(&i->i_mutex); return -ESPIPE; } -static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int hpfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); struct quad_buffer_head qbh; struct hpfs_dirent *de; int lc; - long old_pos; + loff_t next_pos; unsigned char *tempname; int c1, c2 = 0; int ret = 0; @@ -103,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } } lc = hpfs_sb(inode->i_sb)->sb_lowercase; - if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ - filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ + if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */ + ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */ goto out; } - if (filp->f_pos == 13) { + if (ctx->pos == 13) { ret = -ENOENT; goto out; } @@ -118,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) accepted by filldir, but what can I do? maybe killall -9 ls helps */ if (hpfs_sb(inode->i_sb)->sb_chk) - if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { + if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) { ret = -EFSERROR; goto out; } - if (filp->f_pos == 12) + if (ctx->pos == 12) goto out; - if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { - printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); + if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { + printk("HPFS: warning: pos==%d\n",(int)ctx->pos); goto out; } - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) goto out; - filp->f_pos = 11; + ctx->pos = 11; } - if (filp->f_pos == 11) { - if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) + if (ctx->pos == 11) { + if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR)) goto out; - filp->f_pos = 1; + ctx->pos = 1; } - if (filp->f_pos == 1) { - filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; - hpfs_add_pos(inode, &filp->f_pos); - filp->f_version = inode->i_version; + if (ctx->pos == 1) { + ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; + hpfs_add_pos(inode, &file->f_pos); + file->f_version = inode->i_version; } - old_pos = filp->f_pos; - if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { + next_pos = ctx->pos; + if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { + ctx->pos = next_pos; ret = -EIOERROR; goto out; } @@ -152,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (hpfs_sb(inode->i_sb)->sb_chk) { if (de->first && !de->last && (de->namelen != 2 || de ->name[0] != 1 || de->name[1] != 1)) - hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); + hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos); if (de->last && (de->namelen != 1 || de ->name[0] != 255)) - hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); + hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos); } hpfs_brelse4(&qbh); + ctx->pos = next_pos; goto again; } tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); - if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { - filp->f_pos = old_pos; + if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) { if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); goto out; } + ctx->pos = next_pos; if (tempname != de->name) kfree(tempname); hpfs_brelse4(&qbh); } @@ -320,7 +324,7 @@ const struct file_operations hpfs_dir_ops = { .llseek = hpfs_dir_lseek, .read = generic_read_dir, - .readdir = hpfs_readdir, + .iterate = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 3027f4d..4e9dabc 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -7,6 +7,7 @@ */ #include "hpfs_fn.h" +#include <linux/mpage.h> #define BLOCKS(size) (((size) + 511) >> 9) @@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) * so we must ignore such errors. */ -static secno hpfs_bmap(struct inode *inode, unsigned file_secno) +static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs) { struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); unsigned n, disk_secno; @@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno) struct buffer_head *bh; if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; n = file_secno - hpfs_inode->i_file_sec; - if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; + if (n < hpfs_inode->i_n_secs) { + *n_secs = hpfs_inode->i_n_secs - n; + return hpfs_inode->i_disk_sec + n; + } if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); if (disk_secno == -1) return 0; if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; + n = file_secno - hpfs_inode->i_file_sec; + if (n < hpfs_inode->i_n_secs) { + *n_secs = hpfs_inode->i_n_secs - n; + return hpfs_inode->i_disk_sec + n; + } + *n_secs = 1; return disk_secno; } @@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he { int r; secno s; + unsigned n_secs; hpfs_lock(inode->i_sb); - s = hpfs_bmap(inode, iblock); + s = hpfs_bmap(inode, iblock, &n_secs); if (s) { + if (bh_result->b_size >> 9 < n_secs) + n_secs = bh_result->b_size >> 9; map_bh(bh_result, inode->i_sb, s); + bh_result->b_size = n_secs << 9; goto ret_0; } if (!create) goto ret_0; @@ -95,24 +109,40 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he return r; } +static int hpfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, hpfs_get_block); +} + static int hpfs_writepage(struct page *page, struct writeback_control *wbc) { - return block_write_full_page(page,hpfs_get_block, wbc); + return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpage(struct file *file, struct page *page) +static int hpfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) { - return block_read_full_page(page,hpfs_get_block); + return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); +} + +static int hpfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, hpfs_get_block); } static void hpfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + hpfs_lock(inode->i_sb); + if (to > inode->i_size) { truncate_pagecache(inode, to, inode->i_size); hpfs_truncate(inode); } + + hpfs_unlock(inode->i_sb); } static int hpfs_write_begin(struct file *file, struct address_space *mapping, @@ -157,6 +187,8 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, + .readpages = hpfs_readpages, + .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, .bmap = _hpfs_bmap diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b7ae286..1b39863 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -27,8 +27,9 @@ #define ALLOC_FWD_MAX 128 #define ALLOC_M 1 #define FNODE_RD_AHEAD 16 -#define ANODE_RD_AHEAD 16 -#define DNODE_RD_AHEAD 4 +#define ANODE_RD_AHEAD 0 +#define DNODE_RD_AHEAD 72 +#define COUNT_RD_AHEAD 62 #define FREE_DNODES_ADD 58 #define FREE_DNODES_DEL 29 @@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno); /* buffer.c */ +void hpfs_prefetch_sectors(struct super_block *, unsigned, int); void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); @@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *); __le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); __le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); +void hpfs_prefetch_bitmap(struct super_block *, unsigned); unsigned char *hpfs_load_code_page(struct super_block *, secno); __le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 4acb19d..3aa66ae 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, struct quad_buffer_head *qbh, char *id) { secno sec; - if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { + __le32 *ret; + unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) { hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); return NULL; } @@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); return NULL; } - return hpfs_map_4sectors(s, sec, qbh, 4); + ret = hpfs_map_4sectors(s, sec, qbh, 4); + if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1); + return ret; +} + +void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block) +{ + unsigned to_prefetch, next_prefetch; + unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; + if (unlikely(bmp_block >= n_bands)) + return; + to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); + if (unlikely(bmp_block + 1 >= n_bands)) + next_prefetch = 0; + else + next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]); + hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch)); } /* diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a0617e7..4334cda 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) unsigned long *bits; unsigned count; - bits = hpfs_map_4sectors(s, secno, &qbh, 4); + bits = hpfs_map_4sectors(s, secno, &qbh, 0); if (!bits) return 0; count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); @@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s) unsigned n, count, n_bands; n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; - for (n = 0; n < n_bands; n++) + for (n = 0; n < COUNT_RD_AHEAD; n++) { + hpfs_prefetch_bitmap(s, n); + } + for (n = 0; n < n_bands; n++) { + hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); + } return count; } @@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; sbi->sb_max_fwd_alloc = 0xffffff; - + + if (sbi->sb_fs_size >= 0x80000000) { + hpfs_error(s, "invalid size in superblock: %08x", + (unsigned)sbi->sb_fs_size); + goto bail4; + } + /* Load bitmap directory */ if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) goto bail4; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index cd3e389..4338ff3 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra) struct dentry *parent; char *root, *name; const char *seg_name; - int len, seg_len; + int len, seg_len, root_len; len = 0; parent = dentry; @@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra) } root = "proc"; - len += strlen(root); + root_len = strlen(root); + len += root_len; name = kmalloc(len + extra + 1, GFP_KERNEL); if (name == NULL) return NULL; @@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra) while (parent->d_parent != parent) { if (is_pid(parent)) { seg_name = "pid"; - seg_len = strlen("pid"); + seg_len = strlen(seg_name); } else { seg_name = parent->d_name.name; @@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra) len -= seg_len + 1; name[len] = '/'; - strncpy(&name[len + 1], seg_name, seg_len); + memcpy(&name[len + 1], seg_name, seg_len); parent = parent->d_parent; } - strncpy(name, root, strlen(root)); + memcpy(name, root, root_len); return name; } @@ -542,8 +543,8 @@ static const struct file_operations hppfs_file_fops = { }; struct hppfs_dirent { - void *vfs_dirent; - filldir_t filldir; + struct dir_context ctx; + struct dir_context *caller; struct dentry *dentry; }; @@ -555,34 +556,29 @@ static int hppfs_filldir(void *d, const char *name, int size, if (file_removed(dirent->dentry, name)) return 0; - return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, - inode, type); + dirent->caller->pos = dirent->ctx.pos; + return !dir_emit(dirent->caller, name, size, inode, type); } -static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) +static int hppfs_readdir(struct file *file, struct dir_context *ctx) { struct hppfs_private *data = file->private_data; struct file *proc_file = data->proc_file; - int (*readdir)(struct file *, void *, filldir_t); - struct hppfs_dirent dirent = ((struct hppfs_dirent) - { .vfs_dirent = ent, - .filldir = filldir, - .dentry = file->f_path.dentry - }); + struct hppfs_dirent d = { + .ctx.actor = hppfs_filldir, + .caller = ctx, + .dentry = file->f_path.dentry + }; int err; - - readdir = file_inode(proc_file)->i_fop->readdir; - - proc_file->f_pos = file->f_pos; - err = (*readdir)(proc_file, &dirent, hppfs_filldir); - file->f_pos = proc_file->f_pos; - + proc_file->f_pos = ctx->pos; + err = iterate_dir(proc_file, &d.ctx); + ctx->pos = d.ctx.pos; return err; } static const struct file_operations hppfs_dir_fops = { .owner = NULL, - .readdir = hppfs_readdir, + .iterate = hppfs_readdir, .open = hppfs_dir_open, .llseek = default_llseek, .release = hppfs_release, @@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink); */ void inc_nlink(struct inode *inode) { - if (WARN_ON(inode->i_nlink == 0)) + if (unlikely(inode->i_nlink == 0)) { + WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); + } inode->__i_nlink++; } diff --git a/fs/internal.h b/fs/internal.h index eaa75f7..7c5f01c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -96,11 +96,12 @@ struct open_flags { umode_t mode; int acc_mode; int intent; + int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, - const struct open_flags *op, int flags); + const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, - const char *, const struct open_flags *, int lookup_flags); + const char *, const struct open_flags *); extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); @@ -130,6 +131,13 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); * read_write.c */ extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); +extern int rw_verify_area(int, struct file *, const loff_t *, size_t); + +/* + * splice.c + */ +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); /* * pipe.c diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index a7d5c3c..b943cbd 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de, /* * This should _really_ be cleaned up some day.. */ -static int do_isofs_readdir(struct inode *inode, struct file *filp, - void *dirent, filldir_t filldir, +static int do_isofs_readdir(struct inode *inode, struct file *file, + struct dir_context *ctx, char *tmpname, struct iso_directory_record *tmpde) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); @@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, struct iso_directory_record *de; struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); - offset = filp->f_pos & (bufsize - 1); - block = filp->f_pos >> bufbits; + offset = ctx->pos & (bufsize - 1); + block = ctx->pos >> bufbits; - while (filp->f_pos < inode->i_size) { + while (ctx->pos < inode->i_size) { int de_len; if (!bh) { @@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, de = (struct iso_directory_record *) (bh->b_data + offset); - de_len = *(unsigned char *) de; + de_len = *(unsigned char *)de; /* * If the length byte is zero, we should move on to the next @@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if (de_len == 0) { brelse(bh); bh = NULL; - filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); - block = filp->f_pos >> bufbits; + ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); + block = ctx->pos >> bufbits; offset = 0; continue; } @@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if (de->flags[-sbi->s_high_sierra] & 0x80) { first_de = 0; - filp->f_pos += de_len; + ctx->pos += de_len; continue; } first_de = 1; /* Handle the case of the '.' directory */ if (de->name_len[0] == 1 && de->name[0] == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) + if (!dir_emit_dot(file, ctx)) break; - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, /* Handle the case of the '..' directory */ if (de->name_len[0] == 1 && de->name[0] == 1) { - inode_number = parent_ino(filp->f_path.dentry); - if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0) + if (!dir_emit_dotdot(file, ctx)) break; - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || (!sbi->s_showassoc && (de->flags[-sbi->s_high_sierra] & 4))) { - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, } } if (len > 0) { - if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) + if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN)) break; } - filp->f_pos += de_len; + ctx->pos += de_len; continue; } @@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, * handling split directory entries.. The real work is done by * "do_isofs_readdir()". */ -static int isofs_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int isofs_readdir(struct file *file, struct dir_context *ctx) { int result; char *tmpname; struct iso_directory_record *tmpde; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); tmpname = (char *)__get_free_page(GFP_KERNEL); if (tmpname == NULL) @@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp, tmpde = (struct iso_directory_record *) (tmpname+1024); - result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); + result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde); free_page((unsigned long) tmpname); return result; @@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = isofs_readdir, + .iterate = isofs_readdir, }; /* diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d9b8aeb..c348d6d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -28,31 +28,23 @@ #define BEQUIET -static int isofs_hashi(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); -static int isofs_hash(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); +static int isofs_hashi(const struct dentry *parent, struct qstr *qstr); +static int isofs_hash(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #ifdef CONFIG_JOLIET -static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); -static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode, - struct qstr *qstr); +static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr); +static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi_ms(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); static int isofs_dentry_cmp_ms(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, + const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif @@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common( } static int -isofs_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hash(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 0); } static int -isofs_hashi(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 0); } static int -isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 0); } static int -isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 0, 1); @@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, #ifdef CONFIG_JOLIET static int -isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hash_common(dentry, qstr, 1); } static int -isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { return isofs_hashi_common(dentry, qstr, 1); } static int -isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 0); } static int -isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { return isofs_dentry_cmp_common(len, str, name, 1, 1); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index c167028..9529564 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen) qstr.name = compare; qstr.len = dlen; - return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, - dentry->d_name.len, dentry->d_name.name, &qstr); + return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr); } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index e3e255c..be0c39b 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -2019,16 +2019,20 @@ zap_buffer_unlocked: * void journal_invalidatepage() - invalidate a journal page * @journal: journal to use for flush * @page: page to flush - * @offset: length of page to invalidate. + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. + * Reap page buffers containing data in specified range in page. */ void journal_invalidatepage(journal_t *journal, struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; if (!PageLocked(page)) @@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ @@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return; + if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); may_free &= journal_unmap_buffer(journal, bh, - offset > 0); + partial_page); unlock_buffer(bh); } curr_off = next_off; @@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { + if (!partial_page) { if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 69a48c2..5a9f553 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -20,7 +20,7 @@ config JBD2 config JBD2_DEBUG bool "JBD2 (ext4) debugging support" - depends on JBD2 && DEBUG_FS + depends on JBD2 help If you are using the ext4 journaled file system (or potentially any other filesystem/device using JBD2), this option @@ -29,7 +29,7 @@ config JBD2_DEBUG By default, the debugging output will be turned off. If you select Y here, then you will be able to turn on debugging - with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a + with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a number between 1 and 5. The higher the number, the more debugging output is generated. To turn debugging off again, do - "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". + "echo 0 > /sys/module/jbd2/parameters/jbd2_debug". diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c78841e..7f34f47 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ - nblocks = jbd_space_needed(journal); - while (__jbd2_log_space_left(journal) < nblocks) { + nblocks = jbd2_space_needed(journal); + while (jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; write_unlock(&journal->j_state_lock); @@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal) */ write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); - nblocks = jbd_space_needed(journal); - space_left = __jbd2_log_space_left(journal); + nblocks = jbd2_space_needed(journal); + space_left = jbd2_log_space_left(journal); if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; tid_t tid = 0; @@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal) /* We were able to recover space; yay! */ ; } else if (tid) { + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set. So we need to temporarily drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); + write_lock(&journal->j_state_lock); + continue; } else { printk(KERN_ERR "%s: needed %d blocks and " "only had %d space available\n", @@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - - /* Just in case anybody was waiting for more transactions to be - checkpointed... */ - wake_up(&journal->j_wait_logspace); ret = 1; out: return ret; @@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); J_ASSERT(transaction->t_forget == NULL); - J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); - J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(atomic_read(&transaction->t_updates) == 0); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0f53946..559bec1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -30,15 +30,22 @@ #include <trace/events/jbd2.h> /* - * Default IO end handler for temporary BJ_IO buffer_heads. + * IO end handler for temporary buffer_heads handling writes to the journal. */ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { + struct buffer_head *orig_bh = bh->b_private; + BUFFER_TRACE(bh, ""); if (uptodate) set_buffer_uptodate(bh); else clear_buffer_uptodate(bh); + if (orig_bh) { + clear_bit_unlock(BH_Shadow, &orig_bh->b_state); + smp_mb__after_clear_bit(); + wake_up_bit(&orig_bh->b_state, BH_Shadow); + } unlock_buffer(bh); } @@ -85,8 +92,7 @@ nope: __brelse(bh); } -static void jbd2_commit_block_csum_set(journal_t *j, - struct journal_head *descriptor) +static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) { struct commit_header *h; __u32 csum; @@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j, if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - h = (struct commit_header *)(jh2bh(descriptor)->b_data); + h = (struct commit_header *)(bh->b_data); h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, - j->j_blocksize); + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head **cbh, __u32 crc32_sum) { - struct journal_head *descriptor; struct commit_header *tmp; struct buffer_head *bh; int ret; @@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal, if (is_journal_aborted(journal)) return 0; - descriptor = jbd2_journal_get_descriptor_buffer(journal); - if (!descriptor) + bh = jbd2_journal_get_descriptor_buffer(journal); + if (!bh) return 1; - bh = jh2bh(descriptor); - tmp = (struct commit_header *)bh->b_data; tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); @@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } - jbd2_commit_block_csum_set(journal, descriptor); + jbd2_commit_block_csum_set(journal, bh); - JBUFFER_TRACE(descriptor, "submit commit block"); + BUFFER_TRACE(bh, "submit commit block"); lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal, if (unlikely(!buffer_uptodate(bh))) ret = -EIO; put_bh(bh); /* One for getblk() */ - jbd2_journal_put_journal_head(bh2jh(bh)); return ret; } @@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, } static void jbd2_descr_block_csum_set(journal_t *j, - struct journal_head *descriptor) + struct buffer_head *bh) { struct jbd2_journal_block_tail *tail; __u32 csum; @@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j, if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - tail = (struct jbd2_journal_block_tail *) - (jh2bh(descriptor)->b_data + j->j_blocksize - + tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, - j->j_blocksize); + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, { struct page *page = bh->b_page; __u8 *addr; - __u32 csum; + __u32 csum32; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; sequence = cpu_to_be32(sequence); addr = kmap_atomic(page); - csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); - csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), - bh->b_size); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), + bh->b_size); kunmap_atomic(addr); - tag->t_checksum = cpu_to_be32(csum); + /* We only have space to store the lower 16 bits of the crc32c. */ + tag->t_checksum = cpu_to_be16(csum32); } /* * jbd2_journal_commit_transaction @@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) { struct transaction_stats_s stats; transaction_t *commit_transaction; - struct journal_head *jh, *new_jh, *descriptor; + struct journal_head *jh; + struct buffer_head *descriptor; struct buffer_head **wbuf = journal->j_wbuf; int bufs; int flags; @@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) tid_t first_tid; int update_tail; int csum_size = 0; + LIST_HEAD(io_bufs); + LIST_HEAD(log_bufs); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) csum_size = sizeof(struct jbd2_journal_block_tail); @@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd2_start_commit(journal, commit_transaction); jbd_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; trace_jbd2_commit_locking(journal, commit_transaction); @@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + /* + * Reserved credits cannot be claimed anymore, free them + */ + atomic_sub(atomic_read(&journal->j_reserved_credits), + &commit_transaction->t_outstanding_credits); + trace_jbd2_commit_flushing(journal, commit_transaction); stats.run.rs_flushing = jiffies; stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, @@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug(3, "JBD2: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2a\n"); /* * Now start flushing things to disk, in the order they appear @@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, - WRITE_SYNC); + &log_bufs, WRITE_SYNC); blk_finish_plug(&plug); - jbd_debug(3, "JBD2: commit phase 2\n"); + jbd_debug(3, "JBD2: commit phase 2b\n"); /* * Way to go: we have now written out all of the data for a @@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; - descriptor = NULL; bufs = 0; + descriptor = NULL; blk_start_plug(&plug); while (commit_transaction->t_buffers) { @@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) record the metadata buffer. */ if (!descriptor) { - struct buffer_head *bh; - J_ASSERT (bufs == 0); jbd_debug(4, "JBD2: get descriptor\n"); @@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal) continue; } - bh = jh2bh(descriptor); jbd_debug(4, "JBD2: got buffer %llu (%p)\n", - (unsigned long long)bh->b_blocknr, bh->b_data); - header = (journal_header_t *)&bh->b_data[0]; + (unsigned long long)descriptor->b_blocknr, + descriptor->b_data); + header = (journal_header_t *)descriptor->b_data; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); header->h_sequence = cpu_to_be32(commit_transaction->t_tid); - tagp = &bh->b_data[sizeof(journal_header_t)]; - space_left = bh->b_size - sizeof(journal_header_t); + tagp = &descriptor->b_data[sizeof(journal_header_t)]; + space_left = descriptor->b_size - + sizeof(journal_header_t); first_tag = 1; - set_buffer_jwrite(bh); - set_buffer_dirty(bh); - wbuf[bufs++] = bh; + set_buffer_jwrite(descriptor); + set_buffer_dirty(descriptor); + wbuf[bufs++] = descriptor; /* Record it so that we can wait for IO completion later */ - BUFFER_TRACE(bh, "ph3: file as descriptor"); - jbd2_journal_file_buffer(descriptor, commit_transaction, - BJ_LogCtl); + BUFFER_TRACE(descriptor, "ph3: file as descriptor"); + jbd2_file_log_bh(&log_bufs, descriptor); } /* Where is the buffer to be written? */ @@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get - rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + rid of the shadow pairing of buffers. */ atomic_inc(&jh2bh(jh)->b_count); - /* Make a temporary IO buffer with which to write it out - (this will requeue both the metadata buffer and the - temporary IO buffer). new_bh goes on BJ_IO*/ - - set_bit(BH_JWrite, &jh2bh(jh)->b_state); /* - * akpm: jbd2_journal_write_metadata_buffer() sets - * new_bh->b_transaction to commit_transaction. - * We need to clean this up before we release new_bh - * (which is of type BJ_IO) + * Make a temporary IO buffer with which to write it out + * (this will requeue the metadata buffer to BJ_Shadow). */ + set_bit(BH_JWrite, &jh2bh(jh)->b_state); JBUFFER_TRACE(jh, "ph3: write metadata"); flags = jbd2_journal_write_metadata_buffer(commit_transaction, - jh, &new_jh, blocknr); + jh, &wbuf[bufs], blocknr); if (flags < 0) { jbd2_journal_abort(journal, flags); continue; } - set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); - wbuf[bufs++] = jh2bh(new_jh); + jbd2_file_log_bh(&io_bufs, wbuf[bufs]); /* Record the new block's tag in the current descriptor buffer */ @@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag = (journal_block_tag_t *) tagp; write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be16(tag_flag); - jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), + jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], commit_transaction->t_tid); tagp += tag_bytes; space_left -= tag_bytes; + bufs++; if (first_tag) { memcpy (tagp, journal->j_uuid, 16); @@ -809,7 +810,7 @@ start_journal_io: the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. + the io_bufs list. Wait for the buffers in reverse order. That way we are less likely to be woken up until all IOs have completed, and @@ -818,47 +819,33 @@ start_journal_io: jbd_debug(3, "JBD2: commit phase 3\n"); - /* - * akpm: these are BJ_IO, and j_list_lock is not needed. - * See __journal_try_to_free_buffer. - */ -wait_for_iobuf: - while (commit_transaction->t_iobuf_list != NULL) { - struct buffer_head *bh; + while (!list_empty(&io_bufs)) { + struct buffer_head *bh = list_entry(io_bufs.prev, + struct buffer_head, + b_assoc_buffers); - jh = commit_transaction->t_iobuf_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_iobuf; - } - if (cond_resched()) - goto wait_for_iobuf; + wait_on_buffer(bh); + cond_resched(); if (unlikely(!buffer_uptodate(bh))) err = -EIO; - - clear_buffer_jwrite(bh); - - JBUFFER_TRACE(jh, "ph4: unfile after journal write"); - jbd2_journal_unfile_buffer(journal, jh); + jbd2_unfile_log_bh(bh); /* - * ->t_iobuf_list should contain only dummy buffer_heads - * which were created by jbd2_journal_write_metadata_buffer(). + * The list contains temporary buffer heads created by + * jbd2_journal_write_metadata_buffer(). */ BUFFER_TRACE(bh, "dumping temporary bh"); - jbd2_journal_put_journal_head(jh); __brelse(bh); J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); free_buffer_head(bh); - /* We also have to unlock and free the corresponding - shadowed buffer */ + /* We also have to refile the corresponding shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); - clear_bit(BH_JWrite, &bh->b_state); + clear_buffer_jwrite(bh); J_ASSERT_BH(bh, buffer_jbddirty(bh)); + J_ASSERT_BH(bh, !buffer_shadow(bh)); /* The metadata is now released for reuse, but we need to remember it against this transaction so that when @@ -866,14 +853,6 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* - * Wake up any transactions which were waiting for this IO to - * complete. The barrier must be here so that changes by - * jbd2_journal_file_buffer() take effect before wake_up_bit() - * does the waitqueue check. - */ - smp_mb(); - wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); } @@ -883,26 +862,19 @@ wait_for_iobuf: jbd_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ - wait_for_ctlbuf: - while (commit_transaction->t_log_list != NULL) { + while (!list_empty(&log_bufs)) { struct buffer_head *bh; - jh = commit_transaction->t_log_list->b_tprev; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - wait_on_buffer(bh); - goto wait_for_ctlbuf; - } - if (cond_resched()) - goto wait_for_ctlbuf; + bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); + wait_on_buffer(bh); + cond_resched(); if (unlikely(!buffer_uptodate(bh))) err = -EIO; BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); - jbd2_journal_unfile_buffer(journal, jh); - jbd2_journal_put_journal_head(jh); + jbd2_unfile_log_bh(bh); __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } @@ -952,9 +924,7 @@ wait_for_iobuf: J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); - J_ASSERT(commit_transaction->t_iobuf_list == NULL); J_ASSERT(commit_transaction->t_shadow_list == NULL); - J_ASSERT(commit_transaction->t_log_list == NULL); restart_loop: /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9545757..02c7ad9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); +#ifdef CONFIG_JBD2_DEBUG +void __jbd2_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > jbd2_journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd2_debug); +#endif + /* Checksumming functions */ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { @@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal) * * If the source buffer has already been modified by a new transaction * since we took the last commit snapshot, we use the frozen copy of - * that data for IO. If we end up using the existing buffer_head's data - * for the write, then we *have* to lock the buffer to prevent anyone - * else from using and possibly modifying it while the IO is in - * progress. + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we have to make sure nobody modifies it while the + * IO is in progress. do_get_write_access() handles this. * - * The function returns a pointer to the buffer_heads to be used for IO. - * - * We assume that the journal has already been locked in this function. + * The function returns a pointer to the buffer_head to be used for IO. + * * * Return value: * <0: Error @@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal) int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, - struct journal_head **jh_out, - unsigned long long blocknr) + struct buffer_head **bh_out, + sector_t blocknr) { int need_copy_out = 0; int done_copy_out = 0; int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; - struct journal_head *new_jh; struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); @@ -368,14 +383,13 @@ retry_alloc: /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); - new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */ + jbd_lock_bh_state(bh_in); +repeat: /* * If a new transaction has already done a buffer copy-out, then * we use that version of the data for the commit. */ - jbd_lock_bh_state(bh_in); -repeat: if (jh_in->b_frozen_data) { done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); @@ -415,7 +429,7 @@ repeat: jbd_unlock_bh_state(bh_in); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { - jbd2_journal_put_journal_head(new_jh); + brelse(new_bh); return -ENOMEM; } jbd_lock_bh_state(bh_in); @@ -426,7 +440,7 @@ repeat: jh_in->b_frozen_data = tmp; mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); + memcpy(tmp, mapped_data + new_offset, bh_in->b_size); kunmap_atomic(mapped_data); new_page = virt_to_page(tmp); @@ -452,14 +466,14 @@ repeat: } set_bh_page(new_bh, new_page, new_offset); - new_jh->b_transaction = NULL; - new_bh->b_size = jh2bh(jh_in)->b_size; - new_bh->b_bdev = transaction->t_journal->j_dev; + new_bh->b_size = bh_in->b_size; + new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; + new_bh->b_private = bh_in; set_buffer_mapped(new_bh); set_buffer_dirty(new_bh); - *jh_out = new_jh; + *bh_out = new_bh; /* * The to-be-written buffer needs to get moved to the io queue, @@ -470,11 +484,9 @@ repeat: spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); + set_buffer_shadow(bh_in); jbd_unlock_bh_state(bh_in); - JBUFFER_TRACE(new_jh, "file as BJ_IO"); - jbd2_journal_file_buffer(new_jh, transaction, BJ_IO); - return do_escape | (done_copy_out << 1); } @@ -484,35 +496,6 @@ repeat: */ /* - * __jbd2_log_space_left: Return the number of free blocks left in the journal. - * - * Called with the journal already locked. - * - * Called under j_state_lock - */ - -int __jbd2_log_space_left(journal_t *journal) -{ - int left = journal->j_free; - - /* assert_spin_locked(&journal->j_state_lock); */ - - /* - * Be pessimistic here about the number of those free blocks which - * might be required for log descriptor control blocks. - */ - -#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ - - left -= MIN_LOG_RESERVED_BLOCKS; - - if (left <= 0) - return 0; - left -= (left >> 3); - return left; -} - -/* * Called with j_state_lock locked for writing. * Returns true if a transaction commit was started. */ @@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) } /* - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. - * - * We can only force the running transaction if we don't have an active handle; - * otherwise, we will deadlock. - * - * Returns true if a transaction was started. + * Force and wait any uncommitted transactions. We can only force the running + * transaction if we don't have an active handle, otherwise, we will deadlock. + * Returns: <0 in case of error, + * 0 if nothing to commit, + * 1 if transaction was successfully committed. */ -int jbd2_journal_force_commit_nested(journal_t *journal) +static int __jbd2_journal_force_commit(journal_t *journal) { transaction_t *transaction = NULL; tid_t tid; - int need_to_start = 0; + int need_to_start = 0, ret = 0; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { @@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { + /* Nothing to commit */ read_unlock(&journal->j_state_lock); - return 0; /* Nothing to retry */ + return 0; } - tid = transaction->t_tid; read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); - jbd2_log_wait_commit(journal, tid); - return 1; + ret = jbd2_log_wait_commit(journal, tid); + if (!ret) + ret = 1; + + return ret; +} + +/** + * Force and wait upon a commit if the calling process is not within + * transaction. This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. + * + * @journal: journal to force + * Returns true if progress was made. + */ +int jbd2_journal_force_commit_nested(journal_t *journal) +{ + int ret; + + ret = __jbd2_journal_force_commit(journal); + return ret > 0; +} + +/** + * int journal_force_commit() - force any uncommitted transactions + * @journal: journal to force + * + * Caller want unconditional commit. We can only force the running transaction + * if we don't have an active handle, otherwise, we will deadlock. + */ +int jbd2_journal_force_commit(journal_t *journal) +{ + int ret; + + J_ASSERT(!current->journal_info); + ret = __jbd2_journal_force_commit(journal); + if (ret > 0) + ret = 0; + return ret; } /* @@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ -struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) +struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; unsigned long long blocknr; @@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); - return jbd2_journal_add_journal_head(bh); + return bh; } /* @@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void) return NULL; init_waitqueue_head(&journal->j_wait_transaction_locked); - init_waitqueue_head(&journal->j_wait_logspace); init_waitqueue_head(&journal->j_wait_done_commit); - init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); + init_waitqueue_head(&journal->j_wait_reserved); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); @@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void) journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ + atomic_set(&journal->j_reserved_credits, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal) static void jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; + journal_superblock_t *sb = journal->j_superblock; int ret; trace_jbd2_write_superblock(journal, write_op); @@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } + jbd2_superblock_csum_set(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(write_op, bh); @@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", journal->j_errno); sb->s_errno = cpu_to_be32(journal->j_errno); - jbd2_superblock_csum_set(journal, sb); read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, WRITE_SYNC); @@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void) #ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); while (!ret) { yield(); - ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); } } return ret; @@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 626846b..d4851464 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, void *buf, __u32 sequence) { - __u32 provided, calculated; + __u32 csum32; if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; sequence = cpu_to_be32(sequence); - calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, - sizeof(sequence)); - calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); - provided = be32_to_cpu(tag->t_checksum); + csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); - return provided == cpu_to_be32(calculated); + return tag->t_checksum == cpu_to_be16(csum32); } static int do_one_pass(journal_t *journal, diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f30b80b..198c9c1 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -122,9 +122,10 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, - struct journal_head **, int *, + struct list_head *, + struct buffer_head **, int *, struct jbd2_revoke_record_s *, int); -static void flush_descriptor(journal_t *, struct journal_head *, int, int); +static void flush_descriptor(journal_t *, struct buffer_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) */ void jbd2_journal_write_revoke_records(journal_t *journal, transaction_t *transaction, + struct list_head *log_bufs, int write_op) { - struct journal_head *descriptor; + struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; @@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal, while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; - write_one_revoke_record(journal, transaction, + write_one_revoke_record(journal, transaction, log_bufs, &descriptor, &offset, record, write_op); count++; @@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, - struct journal_head **descriptorp, + struct list_head *log_bufs, + struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record, int write_op) { int csum_size = 0; - struct journal_head *descriptor; + struct buffer_head *descriptor; int offset; journal_header_t *header; @@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal, descriptor = jbd2_journal_get_descriptor_buffer(journal); if (!descriptor) return; - header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header = (journal_header_t *)descriptor->b_data; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); header->h_sequence = cpu_to_be32(transaction->t_tid); /* Record it so that we can wait for IO completion later */ - JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); - jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); + BUFFER_TRACE(descriptor, "file in log_bufs"); + jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { - * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = + * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); offset += 8; } else { - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += 4; } @@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal, *offsetp = offset; } -static void jbd2_revoke_csum_set(journal_t *j, - struct journal_head *descriptor) +static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) { struct jbd2_journal_revoke_tail *tail; __u32 csum; @@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j, if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; - tail = (struct jbd2_journal_revoke_tail *) - (jh2bh(descriptor)->b_data + j->j_blocksize - + tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_revoke_tail)); tail->r_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, - j->j_blocksize); + csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); tail->r_checksum = cpu_to_be32(csum); } @@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j, */ static void flush_descriptor(journal_t *journal, - struct journal_head *descriptor, + struct buffer_head *descriptor, int offset, int write_op) { jbd2_journal_revoke_header_t *header; - struct buffer_head *bh = jh2bh(descriptor); if (is_journal_aborted(journal)) { - put_bh(bh); + put_bh(descriptor); return; } - header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_revoke_csum_set(journal, descriptor); - set_buffer_jwrite(bh); - BUFFER_TRACE(bh, "write"); - set_buffer_dirty(bh); - write_dirty_buffer(bh, write_op); + set_buffer_jwrite(descriptor); + BUFFER_TRACE(descriptor, "write"); + set_buffer_dirty(descriptor); + write_dirty_buffer(descriptor, write_op); } #endif diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 10f524c..7aa9a32 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); - atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_outstanding_credits, + atomic_read(&journal->j_reserved_credits)); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction, } /* + * Wait until running transaction passes T_LOCKED state. Also starts the commit + * if needed. The function expects running transaction to exist and releases + * j_state_lock. + */ +static void wait_transaction_locked(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + int need_to_start; + tid_t tid = journal->j_running_transaction->t_tid; + + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + need_to_start = !tid_geq(journal->j_commit_request, tid); + read_unlock(&journal->j_state_lock); + if (need_to_start) + jbd2_log_start_commit(journal, tid); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); +} + +static void sub_reserved_credits(journal_t *journal, int blocks) +{ + atomic_sub(blocks, &journal->j_reserved_credits); + wake_up(&journal->j_wait_reserved); +} + +/* + * Wait until we can add credits for handle to the running transaction. Called + * with j_state_lock held for reading. Returns 0 if handle joined the running + * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and + * caller must retry. + */ +static int add_transaction_credits(journal_t *journal, int blocks, + int rsv_blocks) +{ + transaction_t *t = journal->j_running_transaction; + int needed; + int total = blocks + rsv_blocks; + + /* + * If the current transaction is locked down for commit, wait + * for the lock to be released. + */ + if (t->t_state == T_LOCKED) { + wait_transaction_locked(journal); + return 1; + } + + /* + * If there is not enough space left in the log to write all + * potential buffers requested by this operation, we need to + * stall pending a log checkpoint to free some more log space. + */ + needed = atomic_add_return(total, &t->t_outstanding_credits); + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, + * then start to commit it: we can then go back and + * attach this handle to a new transaction. + */ + atomic_sub(total, &t->t_outstanding_credits); + wait_transaction_locked(journal); + return 1; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + */ + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + return 1; + } + + /* No reservation? We are done... */ + if (!rsv_blocks) + return 0; + + needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits); + /* We allow at most half of a transaction to be reserved */ + if (needed > journal->j_max_transaction_buffers / 2) { + sub_reserved_credits(journal, rsv_blocks); + atomic_sub(total, &t->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) + rsv_blocks + <= journal->j_max_transaction_buffers / 2); + return 1; + } + return 0; +} + +/* * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the @@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; - tid_t tid; - int needed, need_to_start; - int nblocks = handle->h_buffer_credits; + int blocks = handle->h_buffer_credits; + int rsv_blocks = 0; unsigned long ts = jiffies; - if (nblocks > journal->j_max_transaction_buffers) { + /* + * 1/2 of transaction can be reserved so we can practically handle + * only 1/2 of maximum transaction size per operation + */ + if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) { printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", - current->comm, nblocks, - journal->j_max_transaction_buffers); + current->comm, blocks, + journal->j_max_transaction_buffers / 2); return -ENOSPC; } + if (handle->h_rsv_handle) + rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + alloc_transaction: if (!journal->j_running_transaction) { new_transaction = kmem_cache_zalloc(transaction_cache, @@ -199,8 +312,12 @@ repeat: return -EROFS; } - /* Wait on the journal's transaction barrier if necessary */ - if (journal->j_barrier_count) { + /* + * Wait on the journal's transaction barrier if necessary. Specifically + * we allow reserved handles to proceed because otherwise commit could + * deadlock on page writeback not being able to complete. + */ + if (!handle->h_reserved && journal->j_barrier_count) { read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); @@ -213,7 +330,7 @@ repeat: goto alloc_transaction; write_lock(&journal->j_state_lock); if (!journal->j_running_transaction && - !journal->j_barrier_count) { + (handle->h_reserved || !journal->j_barrier_count)) { jbd2_get_transaction(journal, new_transaction); new_transaction = NULL; } @@ -223,85 +340,18 @@ repeat: transaction = journal->j_running_transaction; - /* - * If the current transaction is locked down for commit, wait for the - * lock to be released. - */ - if (transaction->t_state == T_LOCKED) { - DEFINE_WAIT(wait); - - prepare_to_wait(&journal->j_wait_transaction_locked, - &wait, TASK_UNINTERRUPTIBLE); - read_unlock(&journal->j_state_lock); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * If there is not enough space left in the log to write all potential - * buffers requested by this operation, we need to stall pending a log - * checkpoint to free some more log space. - */ - needed = atomic_add_return(nblocks, - &transaction->t_outstanding_credits); - - if (needed > journal->j_max_transaction_buffers) { + if (!handle->h_reserved) { + /* We may have dropped j_state_lock - restart in that case */ + if (add_transaction_credits(journal, blocks, rsv_blocks)) + goto repeat; + } else { /* - * If the current transaction is already too large, then start - * to commit it: we can then go back and attach this handle to - * a new transaction. + * We have handle reserved so we are allowed to join T_LOCKED + * transaction and we don't have to check for transaction size + * and journal space. */ - DEFINE_WAIT(wait); - - jbd_debug(2, "Handle %p starting new commit...\n", handle); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - prepare_to_wait(&journal->j_wait_transaction_locked, &wait, - TASK_UNINTERRUPTIBLE); - tid = transaction->t_tid; - need_to_start = !tid_geq(journal->j_commit_request, tid); - read_unlock(&journal->j_state_lock); - if (need_to_start) - jbd2_log_start_commit(journal, tid); - schedule(); - finish_wait(&journal->j_wait_transaction_locked, &wait); - goto repeat; - } - - /* - * The commit code assumes that it can get enough log space - * without forcing a checkpoint. This is *critical* for - * correctness: a checkpoint of a buffer which is also - * associated with a committing transaction creates a deadlock, - * so commit simply cannot force through checkpoints. - * - * We must therefore ensure the necessary space in the journal - * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. - * - * The worst part is, any transaction currently committing can - * reduce the free space arbitrarily. Be careful to account for - * those buffers when checkpointing. - */ - - /* - * @@@ AKPM: This seems rather over-defensive. We're giving commit - * a _lot_ of headroom: 1/4 of the journal plus the size of - * the committing transaction. Really, we only need to give it - * committing_transaction->t_outstanding_credits plus "enough" for - * the log control blocks. - * Also, this test is inconsistent with the matching one in - * jbd2_journal_extend(). - */ - if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { - jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - read_unlock(&journal->j_state_lock); - write_lock(&journal->j_state_lock); - if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) - __jbd2_log_wait_for_space(journal); - write_unlock(&journal->j_state_lock); - goto repeat; + sub_reserved_credits(journal, blocks); + handle->h_reserved = 0; } /* OK, account for the buffers that this operation expects to @@ -309,15 +359,16 @@ repeat: */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; - handle->h_requested_credits = nblocks; + handle->h_requested_credits = blocks; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); - jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, + jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", + handle, blocks, atomic_read(&transaction->t_outstanding_credits), - __jbd2_log_space_left(journal)); + jbd2_log_space_left(journal)); read_unlock(&journal->j_state_lock); + current->journal_info = handle; lock_map_acquire(&handle->h_lockdep_map); jbd2_journal_free_transaction(new_transaction); @@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks) * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee - * that much space. - * - * This function is visible to journal users (like ext3fs), so is not - * called with the journal already locked. + * that much space. Additionally, if rsv_blocks > 0, we also create another + * handle with rsv_blocks reserved blocks in the journal. This handle is + * is stored in h_rsv_handle. It is not attached to any particular transaction + * and thus doesn't block transaction commit. If the caller uses this reserved + * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() + * on the parent handle will dispose the reserved one. Reserved handle has to + * be converted to a normal handle using jbd2_journal_start_reserved() before + * it can be used. * * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, - unsigned int type, unsigned int line_no) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, + gfp_t gfp_mask, unsigned int type, + unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); + if (rsv_blocks) { + handle_t *rsv_handle; - current->journal_info = handle; + rsv_handle = new_handle(rsv_blocks); + if (!rsv_handle) { + jbd2_free_handle(handle); + return ERR_PTR(-ENOMEM); + } + rsv_handle->h_reserved = 1; + rsv_handle->h_journal = journal; + handle->h_rsv_handle = rsv_handle; + } err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); - current->journal_info = NULL; return ERR_PTR(err); } handle->h_type = type; @@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start); handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); + return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); +void jbd2_journal_free_reserved(handle_t *handle) +{ + journal_t *journal = handle->h_journal; + + WARN_ON(!handle->h_reserved); + sub_reserved_credits(journal, handle->h_buffer_credits); + jbd2_free_handle(handle); +} +EXPORT_SYMBOL(jbd2_journal_free_reserved); + +/** + * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle + * @handle: handle to start + * + * Start handle that has been previously reserved with jbd2_journal_reserve(). + * This attaches @handle to the running transaction (or creates one if there's + * not transaction running). Unlike jbd2_journal_start() this function cannot + * block on journal commit, checkpointing, or similar stuff. It can block on + * memory allocation or frozen journal though. + * + * Return 0 on success, non-zero on error - handle is freed in that case. + */ +int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, + unsigned int line_no) +{ + journal_t *journal = handle->h_journal; + int ret = -EIO; + + if (WARN_ON(!handle->h_reserved)) { + /* Someone passed in normal handle? Just stop it. */ + jbd2_journal_stop(handle); + return ret; + } + /* + * Usefulness of mixing of reserved and unreserved handles is + * questionable. So far nobody seems to need it so just error out. + */ + if (WARN_ON(current->journal_info)) { + jbd2_journal_free_reserved(handle); + return ret; + } + + handle->h_journal = NULL; + /* + * GFP_NOFS is here because callers are likely from writeback or + * similarly constrained call sites + */ + ret = start_this_handle(journal, handle, GFP_NOFS); + if (ret < 0) + jbd2_journal_free_reserved(handle); + handle->h_type = type; + handle->h_line_no = line_no; + return ret; +} +EXPORT_SYMBOL(jbd2_journal_start_reserved); /** * int jbd2_journal_extend() - extend buffer credits. @@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start); int jbd2_journal_extend(handle_t *handle, int nblocks) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; int result; int wanted; - result = -EIO; + WARN_ON(!transaction); if (is_handle_aborted(handle)) - goto out; + return -EROFS; + journal = transaction->t_journal; result = 1; read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ - if (handle->h_transaction->t_state != T_RUNNING) { + if (transaction->t_state != T_RUNNING) { jbd_debug(3, "denied handle %p %d blocks: " "transaction not running\n", handle, nblocks); goto error_out; } spin_lock(&transaction->t_handle_lock); - wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; + wanted = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " "transaction too large\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); goto unlock; } - if (wanted > __jbd2_log_space_left(journal)) { + if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > + jbd2_log_space_left(journal)) { jbd_debug(3, "denied handle %p %d blocks: " "insufficient log space\n", handle, nblocks); + atomic_sub(nblocks, &transaction->t_outstanding_credits); goto unlock; } trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, - handle->h_transaction->t_tid, + transaction->t_tid, handle->h_type, handle->h_line_no, handle->h_buffer_credits, nblocks); handle->h_buffer_credits += nblocks; handle->h_requested_credits += nblocks; - atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -473,7 +599,6 @@ unlock: spin_unlock(&transaction->t_handle_lock); error_out: read_unlock(&journal->j_state_lock); -out: return result; } @@ -490,19 +615,22 @@ out: * to a running handle, a call to jbd2_journal_restart will commit the * handle's transaction so far and reattach the handle to a new * transaction capabable of guaranteeing the requested number of - * credits. + * credits. We preserve reserved handle if there's any attached to the + * passed in handle. */ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; tid_t tid; int need_to_start, ret; + WARN_ON(!transaction); /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; + journal = transaction->t_journal; /* * First unlink the handle from its current transaction, and start the @@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) spin_lock(&transaction->t_handle_lock); atomic_sub(handle->h_buffer_credits, &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) { + sub_reserved_credits(journal, + handle->h_rsv_handle->h_buffer_credits); + } if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); + tid = transaction->t_tid; spin_unlock(&transaction->t_handle_lock); + handle->h_transaction = NULL; + current->journal_info = NULL; jbd_debug(2, "restarting handle %p\n", handle); - tid = transaction->t_tid; need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) @@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal) write_lock(&journal->j_state_lock); ++journal->j_barrier_count; + /* Wait until there are no reserved handles */ + if (atomic_read(&journal->j_reserved_credits)) { + write_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_reserved, + atomic_read(&journal->j_reserved_credits) == 0); + write_lock(&journal->j_state_lock); + } + /* Wait until there are no running updates */ while (1) { transaction_t *transaction = journal->j_running_transaction; @@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +static int sleep_on_shadow_bh(void *word) +{ + io_schedule(); + return 0; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { struct buffer_head *bh; - transaction_t *transaction; + transaction_t *transaction = handle->h_transaction; journal_t *journal; int error; char *frozen_buffer = NULL; int need_copy = 0; unsigned long start_lock, time_lock; + WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; - - transaction = handle->h_transaction; journal = transaction->t_journal; jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); @@ -754,41 +901,29 @@ repeat: * journaled. If the primary copy is already going to * disk then we cannot do copy-out here. */ - if (jh->b_jlist == BJ_Shadow) { - DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow); - wait_queue_head_t *wqh; - - wqh = bit_waitqueue(&bh->b_state, BH_Unshadow); - + if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); - /* commit wakes up all shadow buffers after IO */ - for ( ; ; ) { - prepare_to_wait(wqh, &wait.wait, - TASK_UNINTERRUPTIBLE); - if (jh->b_jlist != BJ_Shadow) - break; - schedule(); - } - finish_wait(wqh, &wait.wait); + wait_on_bit(&bh->b_state, BH_Shadow, + sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); goto repeat; } - /* Only do the copy if the currently-owning transaction - * still needs it. If it is on the Forget list, the - * committing transaction is past that stage. The - * buffer had better remain locked during the kmalloc, - * but that should be true --- we hold the journal lock - * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. + /* + * Only do the copy if the currently-owning transaction still + * needs it. If buffer isn't on BJ_Metadata list, the + * committing transaction is past that stage (here we use the + * fact that BH_Shadow is set under bh_state lock together with + * refiling to BJ_Shadow list and at this point we know the + * buffer doesn't have BH_Shadow set). * * Subtle point, though: if this is a get_undo_access, * then we will be relying on the frozen_data to contain * the new value of the committed_data record after the * transaction, so we HAVE to force the frozen_data copy - * in that case. */ - - if (jh->b_jlist != BJ_Forget || force_copy) { + * in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); @@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; struct journal_head *jh = jbd2_journal_add_journal_head(bh); int err; jbd_debug(5, "journal_head %p\n", jh); + WARN_ON(!transaction); err = -EROFS; if (is_handle_aborted(handle)) goto out; + journal = transaction->t_journal; err = 0; JBUFFER_TRACE(jh, "entry"); @@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; struct journal_head *jh; int ret = 0; + WARN_ON(!transaction); if (is_handle_aborted(handle)) - goto out; + return -EROFS; + journal = transaction->t_journal; jh = jbd2_journal_grab_journal_head(bh); if (!jh) { ret = -EUCLEAN; @@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); @@ -1258,12 +1397,17 @@ out: int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; struct journal_head *jh; int drop_reserve = 0; int err = 0; int was_modified = 0; + WARN_ON(!transaction); + if (is_handle_aborted(handle)) + return -EROFS; + journal = transaction->t_journal; + BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); @@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) */ jh->b_modified = 0; - if (jh->b_transaction == handle->h_transaction) { + if (jh->b_transaction == transaction) { J_ASSERT_JH(jh, !jh->b_frozen_data); /* If we are forgetting a buffer which is already part @@ -1385,19 +1529,21 @@ drop: int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - int err, wait_for_commit = 0; + journal_t *journal; + int err = 0, wait_for_commit = 0; tid_t tid; pid_t pid; + if (!transaction) + goto free_and_exit; + journal = transaction->t_journal; + J_ASSERT(journal_current_handle() == handle); if (is_handle_aborted(handle)) err = -EIO; - else { + else J_ASSERT(atomic_read(&transaction->t_updates) > 0); - err = 0; - } if (--handle->h_ref > 0) { jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, @@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, - handle->h_transaction->t_tid, + transaction->t_tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, @@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle) lock_map_release(&handle->h_lockdep_map); + if (handle->h_rsv_handle) + jbd2_journal_free_reserved(handle->h_rsv_handle); +free_and_exit: jbd2_free_handle(handle); return err; } -/** - * int jbd2_journal_force_commit() - force any uncommitted transactions - * @journal: journal to force - * - * For synchronous operations: force any uncommitted transactions - * to disk. May seem kludgy, but it reuses all the handle batching - * code in a very simple manner. - */ -int jbd2_journal_force_commit(journal_t *journal) -{ - handle_t *handle; - int ret; - - handle = jbd2_journal_start(journal, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - } else { - handle->h_sync = 1; - ret = jbd2_journal_stop(handle); - } - return ret; -} - /* * * List management code snippets: various functions for manipulating the @@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, - * t_log_list or t_reserved_list. If the caller is holding onto a copy of one - * of these pointers, it could go bad. Generally the caller needs to re-read - * the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or + * t_reserved_list. If the caller is holding onto a copy of one of these + * pointers, it could go bad. Generally the caller needs to re-read the + * pointer from the transaction_t. * * Called under j_list_lock. */ @@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Forget: list = &transaction->t_forget; break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; case BJ_Shadow: list = &transaction->t_shadow_list; break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; case BJ_Reserved: list = &transaction->t_reserved_list; break; @@ -2034,18 +2154,23 @@ zap_buffer_unlocked: * void jbd2_journal_invalidatepage() * @journal: journal to use for flush... * @page: page to flush - * @offset: length of page to invalidate. + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. Can return -EBUSY - * if buffers are part of the committing transaction and the page is straddling - * i_size. Caller then has to wait for current commit and try again. + * Reap page buffers containing data after in the specified range in page. + * Can return -EBUSY if buffers are part of the committing transaction and + * the page is straddling i_size. Caller then has to wait for current commit + * and try again. */ int jbd2_journal_invalidatepage(journal_t *journal, struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; int ret = 0; @@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return 0; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ @@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return 0; + if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - ret = journal_unmap_buffer(journal, bh, offset > 0); + ret = journal_unmap_buffer(journal, bh, partial_page); unlock_buffer(bh); if (ret < 0) return ret; @@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { + if (!partial_page) { if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } @@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Forget: list = &transaction->t_forget; break; - case BJ_IO: - list = &transaction->t_iobuf_list; - break; case BJ_Shadow: list = &transaction->t_shadow_list; break; - case BJ_LogCtl: - list = &transaction->t_log_list; - break; case BJ_Reserved: list = &transaction->t_reserved_list; break; @@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) { transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; + journal_t *journal; + WARN_ON(!transaction); if (is_handle_aborted(handle)) - return -EIO; + return -EROFS; + journal = transaction->t_journal; jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index acd46a4..e3aac22 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -22,7 +22,7 @@ #include <linux/time.h> #include "nodelist.h" -static int jffs2_readdir (struct file *, void *, filldir_t); +static int jffs2_readdir (struct file *, struct dir_context *); static int jffs2_create (struct inode *,struct dentry *,umode_t, bool); @@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *, const struct file_operations jffs2_dir_operations = { .read = generic_read_dir, - .readdir = jffs2_readdir, + .iterate = jffs2_readdir, .unlocked_ioctl=jffs2_ioctl, .fsync = jffs2_fsync, .llseek = generic_file_llseek, @@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, /***********************************************************************/ -static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int jffs2_readdir(struct file *file, struct dir_context *ctx) { - struct jffs2_inode_info *f; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_full_dirent *fd; - unsigned long offset, curofs; + unsigned long curofs = 1; - jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", - file_inode(filp)->i_ino); + jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino); - f = JFFS2_INODE_INFO(inode); - - offset = filp->f_pos; - - if (offset == 0) { - jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino); - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - offset++; - } - if (offset == 1) { - unsigned long pino = parent_ino(filp->f_path.dentry); - jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino); - if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0) - goto out; - offset++; - } + if (!dir_emit_dots(file, ctx)) + return 0; - curofs=1; mutex_lock(&f->sem); for (fd = f->dents; fd; fd = fd->next) { - curofs++; - /* First loop: curofs = 2; offset = 2 */ - if (curofs < offset) { + /* First loop: curofs = 2; pos = 2 */ + if (curofs < ctx->pos) { jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", - fd->name, fd->ino, fd->type, curofs, offset); + fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos); continue; } if (!fd->ino) { jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", fd->name); - offset++; + ctx->pos++; continue; } jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", - offset, fd->name, fd->ino, fd->type); - if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) + (unsigned long)ctx->pos, fd->name, fd->ino, fd->type); + if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type)) break; - offset++; + ctx->pos++; } mutex_unlock(&f->sem); - out: - filp->f_pos = offset; return 0; } diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 0ddbece..9f4ed13 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ -int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +int jfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *ip = file_inode(filp); + struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ @@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ - if (filp->f_pos == DIREND) + if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { @@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) */ do_index = 1; - dir_index = (u32) filp->f_pos; + dir_index = (u32) ctx->pos; if (dir_index > 1) { struct dir_table_slot dirtab_slot; @@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. Directory has shrunk */ - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected " "infinite loop!"); - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } goto repeat; @@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); - filp->f_pos = -1; + ctx->pos = -1; return 0; } } else { @@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * self "." */ - filp->f_pos = 0; - if (filldir(dirent, ".", 1, 0, ip->i_ino, - DT_DIR)) + ctx->pos = 0; + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ - filp->f_pos = 1; - if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) + ctx->pos = 1; + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } @@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ - dtpos = filp->f_pos; + dtpos = ctx->pos; if (dtpos == 0) { /* build "." entry */ - - if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, - DT_DIR)) + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 1; - filp->f_pos = dtpos; + ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 1) { /* build ".." entry */ - - if (filldir(dirent, "..", 2, filp->f_pos, - PARENT(ip), DT_DIR)) + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with " @@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } dtoffset->pn = 1; dtoffset->index = 0; - filp->f_pos = dtpos; + ctx->pos = dtpos; } if (dtEmpty(ip)) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } - if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { + if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d " "from dtReadNext", rc); - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } /* get start leaf page and index */ @@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* offset beyond directory eof ? */ if (bn < 0) { - filp->f_pos = DIREND; + ctx->pos = DIREND; return 0; } } @@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); - filp->f_pos = DIREND; + ctx->pos = DIREND; return -ENOMEM; } @@ -3295,9 +3290,9 @@ skip_one: jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { - filp->f_pos = jfs_dirent->position; - if (filldir(dirent, jfs_dirent->name, - jfs_dirent->name_len, filp->f_pos, + ctx->pos = jfs_dirent->position; + if (!dir_emit(ctx, jfs_dirent->name, + jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); @@ -3309,7 +3304,7 @@ skip_one: } if (!overflow && (bn == 0)) { - filp->f_pos = DIREND; + ctx->pos = DIREND; break; } diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index 2545bb3..fd4169e 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag); -extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); +extern int jfs_readdir(struct file *file, struct dir_context *ctx); #endif /* !_H_JFS_DTREE */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index c57499d..360d27c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2009,7 +2009,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ_SYNC, bio); + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(READ_SYNC, bio); + } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 6740d34..9e3aaff 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) return ret; } -static void metapage_invalidatepage(struct page *page, unsigned long offset) +static void metapage_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - BUG_ON(offset); + BUG_ON(offset || length < PAGE_CACHE_SIZE); BUG_ON(PageWriteback(page)); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 3b91a7a..8b19027 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = { const struct file_operations jfs_dir_operations = { .read = generic_read_dir, - .readdir = jfs_readdir, + .iterate = jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, #ifdef CONFIG_COMPAT @@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = { .llseek = generic_file_llseek, }; -static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, - struct qstr *this) +static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) { unsigned long hash; int i; @@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode, return 0; } -static int jfs_ci_compare(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2003e83..788e0a9 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; + int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { txQuiesce(sb); - lmLogShutdown(log); - updateSuper(sb, FM_CLEAN); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "jfs_freeze: lmLogShutdown failed"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed\n"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } } return 0; } @@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb) int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { - updateSuper(sb, FM_MOUNT); - if ((rc = lmLogInit(log))) - jfs_err("jfs_unlock failed with return code %d", rc); - else - txResume(sb); + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "jfs_unfreeze: updateSuper failed"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "jfs_unfreeze: lmLogInit failed"); +out: + txResume(sb); } - return 0; + return rc; } static struct dentry *jfs_do_mount(struct file_system_type *fs_type, @@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode) * both impossible due to the lock on directory. */ -int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) +int dcache_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct dentry *cursor = filp->private_data; + struct dentry *dentry = file->f_path.dentry; + struct dentry *cursor = file->private_data; struct list_head *p, *q = &cursor->d_u.d_child; - ino_t ino; - int i = filp->f_pos; - switch (i) { - case 0: - ino = dentry->d_inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - break; - filp->f_pos++; - i++; - /* fallthrough */ - default: - spin_lock(&dentry->d_lock); - if (filp->f_pos == 2) - list_move(q, &dentry->d_subdirs); - - for (p=q->next; p != &dentry->d_subdirs; p=p->next) { - struct dentry *next; - next = list_entry(p, struct dentry, d_u.d_child); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - if (!simple_positive(next)) { - spin_unlock(&next->d_lock); - continue; - } + if (!dir_emit_dots(file, ctx)) + return 0; + spin_lock(&dentry->d_lock); + if (ctx->pos == 2) + list_move(q, &dentry->d_subdirs); + + for (p = q->next; p != &dentry->d_subdirs; p = p->next) { + struct dentry *next = list_entry(p, struct dentry, d_u.d_child); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); + continue; + } - spin_unlock(&next->d_lock); - spin_unlock(&dentry->d_lock); - if (filldir(dirent, next->d_name.name, - next->d_name.len, filp->f_pos, - next->d_inode->i_ino, - dt_type(next->d_inode)) < 0) - return 0; - spin_lock(&dentry->d_lock); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - /* next is still alive */ - list_move(q, p); - spin_unlock(&next->d_lock); - p = q; - filp->f_pos++; - } - spin_unlock(&dentry->d_lock); + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); + if (!dir_emit(ctx, next->d_name.name, next->d_name.len, + next->d_inode->i_ino, dt_type(next->d_inode))) + return 0; + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + /* next is still alive */ + list_move(q, p); + spin_unlock(&next->d_lock); + p = q; + ctx->pos++; } + spin_unlock(&dentry->d_lock); return 0; } @@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = { .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, - .readdir = dcache_readdir, + .iterate = dcache_readdir, .fsync = noop_fsync, }; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index a2aa97d..10d6c41 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv) svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); printk(KERN_WARNING diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index e703318..067778b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block) dprintk("lockd: unlinking block %p...\n", block); /* Remove block from list */ - status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl); + status = posix_unblock_lock(&block->b_call->a_args.lock.fl); nlmsvc_remove_block(block); return status; } @@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid; } +/* + * Since NLM uses two "keys" for tracking locks, we need to hash them down + * to one for the blocked_hash. Here, we're just xor'ing the host address + * with the pid in order to create a key value for picking a hash bucket. + */ +static unsigned long +nlmsvc_owner_key(struct file_lock *fl) +{ + return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid; +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_compare_owner = nlmsvc_same_owner, + .lm_owner_key = nlmsvc_owner_key, .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, }; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 97e8741..dc5c759 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, again: file->f_locks = 0; - lock_flocks(); /* protects i_flock list */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -181,7 +181,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; - unlock_flocks(); + spin_unlock(&inode->i_lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -193,7 +193,7 @@ again: goto again; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); return 0; } @@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file) if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl; fl = fl->fl_next) { if (fl->fl_lmops == &nlmsvc_lock_operations) { - unlock_flocks(); + spin_unlock(&inode->i_lock); return 1; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); file->f_locks = 0; return 0; } @@ -126,6 +126,9 @@ #include <linux/time.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> +#include <linux/hashtable.h> +#include <linux/percpu.h> +#include <linux/lglock.h> #include <asm/uaccess.h> @@ -153,30 +156,53 @@ int lease_break_time = 45; #define for_each_lock(inode, lockp) \ for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) -static LIST_HEAD(file_lock_list); -static LIST_HEAD(blocked_list); -static DEFINE_SPINLOCK(file_lock_lock); +/* + * The global file_lock_list is only used for displaying /proc/locks, so we + * keep a list on each CPU, with each list protected by its own spinlock via + * the file_lock_lglock. Note that alterations to the list also require that + * the relevant i_lock is held. + */ +DEFINE_STATIC_LGLOCK(file_lock_lglock); +static DEFINE_PER_CPU(struct hlist_head, file_lock_list); /* - * Protects the two list heads above, plus the inode->i_flock list + * The blocked_hash is used to find POSIX lock loops for deadlock detection. + * It is protected by blocked_lock_lock. + * + * We hash locks by lockowner in order to optimize searching for the lock a + * particular lockowner is waiting on. + * + * FIXME: make this value scale via some heuristic? We generally will want more + * buckets when we have more lockowners holding locks, but that's a little + * difficult to determine without knowing what the workload will look like. */ -void lock_flocks(void) -{ - spin_lock(&file_lock_lock); -} -EXPORT_SYMBOL_GPL(lock_flocks); +#define BLOCKED_HASH_BITS 7 +static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); -void unlock_flocks(void) -{ - spin_unlock(&file_lock_lock); -} -EXPORT_SYMBOL_GPL(unlock_flocks); +/* + * This lock protects the blocked_hash. Generally, if you're accessing it, you + * want to be holding this lock. + * + * In addition, it also protects the fl->fl_block list, and the fl->fl_next + * pointer for file_lock structures that are acting as lock requests (in + * contrast to those that are acting as records of acquired locks). + * + * Note that when we acquire this lock in order to change the above fields, + * we often hold the i_lock as well. In certain cases, when reading the fields + * protected by this lock, we can skip acquiring it iff we already hold the + * i_lock. + * + * In particular, adding an entry to the fl_block list requires that you hold + * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting + * an entry from the list however only requires the file_lock_lock. + */ +static DEFINE_SPINLOCK(blocked_lock_lock); static struct kmem_cache *filelock_cache __read_mostly; static void locks_init_lock_heads(struct file_lock *fl) { - INIT_LIST_HEAD(&fl->fl_link); + INIT_HLIST_NODE(&fl->fl_link); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); } @@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); BUG_ON(!list_empty(&fl->fl_block)); - BUG_ON(!list_empty(&fl->fl_link)); + BUG_ON(!hlist_unhashed(&fl->fl_link)); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); @@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner; } +/* Must be called with the i_lock held! */ +static inline void +locks_insert_global_locks(struct file_lock *fl) +{ + lg_local_lock(&file_lock_lglock); + fl->fl_link_cpu = smp_processor_id(); + hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); + lg_local_unlock(&file_lock_lglock); +} + +/* Must be called with the i_lock held! */ +static inline void +locks_delete_global_locks(struct file_lock *fl) +{ + /* + * Avoid taking lock if already unhashed. This is safe since this check + * is done while holding the i_lock, and new insertions into the list + * also require that it be held. + */ + if (hlist_unhashed(&fl->fl_link)) + return; + lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + hlist_del_init(&fl->fl_link); + lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); +} + +static unsigned long +posix_owner_key(struct file_lock *fl) +{ + if (fl->fl_lmops && fl->fl_lmops->lm_owner_key) + return fl->fl_lmops->lm_owner_key(fl); + return (unsigned long)fl->fl_owner; +} + +static inline void +locks_insert_global_blocked(struct file_lock *waiter) +{ + hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); +} + +static inline void +locks_delete_global_blocked(struct file_lock *waiter) +{ + hash_del(&waiter->fl_link); +} + /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. + * + * Must be called with blocked_lock_lock held. */ static void __locks_delete_block(struct file_lock *waiter) { + locks_delete_global_blocked(waiter); list_del_init(&waiter->fl_block); - list_del_init(&waiter->fl_link); waiter->fl_next = NULL; } -/* - */ -void locks_delete_block(struct file_lock *waiter) +static void locks_delete_block(struct file_lock *waiter) { - lock_flocks(); + spin_lock(&blocked_lock_lock); __locks_delete_block(waiter); - unlock_flocks(); + spin_unlock(&blocked_lock_lock); } -EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. + * + * Must be called with both the i_lock and blocked_lock_lock held. The fl_block + * list itself is protected by the file_lock_list, but by ensuring that the + * i_lock is also held on insertions we can avoid taking the blocked_lock_lock + * in some cases when we see that the fl_block list is empty. */ -static void locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter) +static void __locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) { BUG_ON(!list_empty(&waiter->fl_block)); - list_add_tail(&waiter->fl_block, &blocker->fl_block); waiter->fl_next = blocker; + list_add_tail(&waiter->fl_block, &blocker->fl_block); if (IS_POSIX(blocker)) - list_add(&waiter->fl_link, &blocked_list); + locks_insert_global_blocked(waiter); } -/* Wake up processes blocked waiting for blocker. - * If told to wait then schedule the processes until the block list - * is empty, otherwise empty the block list ourselves. +/* Must be called with i_lock held. */ +static void locks_insert_block(struct file_lock *blocker, + struct file_lock *waiter) +{ + spin_lock(&blocked_lock_lock); + __locks_insert_block(blocker, waiter); + spin_unlock(&blocked_lock_lock); +} + +/* + * Wake up processes blocked waiting for blocker. + * + * Must be called with the inode->i_lock held! */ static void locks_wake_up_blocks(struct file_lock *blocker) { + /* + * Avoid taking global lock if list is empty. This is safe since new + * blocked requests are only added to the list under the i_lock, and + * the i_lock is always held here. Note that removal from the fl_block + * list does not require the i_lock, so we must recheck list_empty() + * after acquiring the blocked_lock_lock. + */ + if (list_empty(&blocker->fl_block)) + return; + + spin_lock(&blocked_lock_lock); while (!list_empty(&blocker->fl_block)) { struct file_lock *waiter; @@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker) else wake_up(&waiter->fl_wait); } + spin_unlock(&blocked_lock_lock); } /* Insert file lock fl into an inode's lock list at the position indicated * by pos. At the same time add the lock to the global file lock list. + * + * Must be called with the i_lock held! */ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) { - list_add(&fl->fl_link, &file_lock_list); - fl->fl_nspid = get_pid(task_tgid(current)); /* insert into file's list */ fl->fl_next = *pos; *pos = fl; + + locks_insert_global_locks(fl); } /* @@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) * Wake up processes that are blocked waiting for this lock, * notify the FS that the lock has been cleared and * finally free the lock. + * + * Must be called with the i_lock held! */ static void locks_delete_lock(struct file_lock **thisfl_p) { struct file_lock *fl = *thisfl_p; + locks_delete_global_locks(fl); + *thisfl_p = fl->fl_next; fl->fl_next = NULL; - list_del_init(&fl->fl_link); if (fl->fl_nspid) { put_pid(fl->fl_nspid); @@ -625,8 +728,9 @@ void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; + struct inode *inode = file_inode(filp); - lock_flocks(); + spin_lock(&inode->i_lock); for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { if (!IS_POSIX(cfl)) continue; @@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) fl->fl_pid = pid_vnr(cfl->fl_nspid); } else fl->fl_type = F_UNLCK; - unlock_flocks(); + spin_unlock(&inode->i_lock); return; } EXPORT_SYMBOL(posix_test_lock); @@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) { struct file_lock *fl; - list_for_each_entry(fl, &blocked_list, fl_link) { + hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { if (posix_same_owner(fl, block_fl)) return fl->fl_next; } return NULL; } +/* Must be called with the blocked_lock_lock held! */ static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { @@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) return -ENOMEM; } - lock_flocks(); + spin_lock(&inode->i_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) { - unlock_flocks(); + spin_unlock(&inode->i_lock); cond_resched(); - lock_flocks(); + spin_lock(&inode->i_lock); } find_conflict: @@ -777,7 +882,7 @@ find_conflict: error = 0; out: - unlock_flocks(); + spin_unlock(&inode->i_lock); if (new_fl) locks_free_lock(new_fl); return error; @@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str struct file_lock *left = NULL; struct file_lock *right = NULL; struct file_lock **before; - int error, added = 0; + int error; + bool added = false; /* * We may need two file_lock structures for this operation, @@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str new_fl2 = locks_alloc_lock(); } - lock_flocks(); + spin_lock(&inode->i_lock); + /* + * New lock request. Walk all POSIX locks and look for conflicts. If + * there are any, either return error or put the request on the + * blocker's list of waiters and the global blocked_hash. + */ if (request->fl_type != F_UNLCK) { for_each_lock(inode, before) { fl = *before; @@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str error = -EAGAIN; if (!(request->fl_flags & FL_SLEEP)) goto out; + /* + * Deadlock detection and insertion into the blocked + * locks list must be done while holding the same lock! + */ error = -EDEADLK; - if (posix_locks_deadlock(request, fl)) - goto out; - error = FILE_LOCK_DEFERRED; - locks_insert_block(fl, request); + spin_lock(&blocked_lock_lock); + if (likely(!posix_locks_deadlock(request, fl))) { + error = FILE_LOCK_DEFERRED; + __locks_insert_block(fl, request); + } + spin_unlock(&blocked_lock_lock); goto out; } } @@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str before = &fl->fl_next; } - /* Process locks with this owner. */ + /* Process locks with this owner. */ while ((fl = *before) && posix_same_owner(request, fl)) { /* Detect adjacent or overlapping regions (if same lock type) */ @@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str continue; } request = fl; - added = 1; + added = true; } else { /* Processing for different lock types is a bit @@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (fl->fl_start > request->fl_end) break; if (request->fl_type == F_UNLCK) - added = 1; + added = true; if (fl->fl_start < request->fl_start) left = fl; /* If the next lock in the list has a higher end @@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_release_private(fl); locks_copy_private(fl, request); request = fl; - added = 1; + added = true; } } /* Go on to next lock. @@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str } /* - * The above code only modifies existing locks in case of - * merging or replacing. If new lock(s) need to be inserted - * all modifications are done bellow this, so it's safe yet to - * bail out. + * The above code only modifies existing locks in case of merging or + * replacing. If new lock(s) need to be inserted all modifications are + * done below this, so it's safe yet to bail out. */ error = -ENOLCK; /* "no luck" */ if (right && left == right && !new_fl2) @@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_wake_up_blocks(left); } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); /* * Free any unused locks. */ @@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode) /* * Search the lock list for this inode for any POSIX locks. */ - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; if (fl->fl_owner != owner) break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return fl ? -EAGAIN : 0; } @@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode) if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - lock_flocks(); + spin_lock(&inode->i_lock); time_out_leases(inode); @@ -1249,11 +1365,11 @@ restart: break_time++; } locks_insert_block(flock, new_fl); - unlock_flocks(); + spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - lock_flocks(); - __locks_delete_block(new_fl); + spin_lock(&inode->i_lock); + locks_delete_block(new_fl); if (error >= 0) { if (error == 0) time_out_leases(inode); @@ -1270,7 +1386,7 @@ restart: } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); locks_free_lock(new_fl); return error; } @@ -1323,9 +1439,10 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; + struct inode *inode = file_inode(filp); int type = F_UNLCK; - lock_flocks(); + spin_lock(&inode->i_lock); time_out_leases(file_inode(filp)); for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { @@ -1334,11 +1451,11 @@ int fcntl_getlease(struct file *filp) break; } } - unlock_flocks(); + spin_unlock(&inode->i_lock); return type; } -int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) +static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; @@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; if ((arg == F_WRLCK) - && ((dentry->d_count > 1) + && ((d_count(dentry) > 1) || (atomic_read(&inode->i_count) > 1))) goto out; @@ -1403,7 +1520,7 @@ out: return error; } -int generic_delete_lease(struct file *filp, struct file_lock **flp) +static int generic_delete_lease(struct file *filp, struct file_lock **flp) { struct file_lock *fl, **before; struct dentry *dentry = filp->f_path.dentry; @@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). * - * Called with file_lock_lock held. + * Called with inode->i_lock held. */ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) { @@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) { + struct inode *inode = file_inode(filp); int error; - lock_flocks(); + spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, lease); - unlock_flocks(); + spin_unlock(&inode->i_lock); return error; } @@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp) static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { struct file_lock *fl, *ret; + struct inode *inode = file_inode(filp); struct fasync_struct *new; int error; @@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) return -ENOMEM; } ret = fl; - lock_flocks(); + spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, &ret); if (error) { - unlock_flocks(); + spin_unlock(&inode->i_lock); locks_free_lock(fl); goto out_free_fasync; } @@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) new = NULL; error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - unlock_flocks(); + spin_unlock(&inode->i_lock); out_free_fasync: if (new) @@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp) fl.fl_ops->fl_release_private(&fl); } - lock_flocks(); + spin_lock(&inode->i_lock); before = &inode->i_flock; while ((fl = *before) != NULL) { @@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp) } before = &fl->fl_next; } - unlock_flocks(); + spin_unlock(&inode->i_lock); } /** * posix_unblock_lock - stop waiting for a file lock - * @filp: how the file was opened * @waiter: the lock which was waiting * * lockd needs to block waiting for locks. */ int -posix_unblock_lock(struct file *filp, struct file_lock *waiter) +posix_unblock_lock(struct file_lock *waiter) { int status = 0; - lock_flocks(); + spin_lock(&blocked_lock_lock); if (waiter->fl_next) __locks_delete_block(waiter); else status = -ENOENT; - unlock_flocks(); + spin_unlock(&blocked_lock_lock); return status; } - EXPORT_SYMBOL(posix_unblock_lock); /** @@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock); #include <linux/proc_fs.h> #include <linux/seq_file.h> +struct locks_iterator { + int li_cpu; + loff_t li_pos; +}; + static void lock_get_status(struct seq_file *f, struct file_lock *fl, loff_t id, char *pfx) { @@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, static int locks_show(struct seq_file *f, void *v) { + struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; - fl = list_entry(v, struct file_lock, fl_link); + fl = hlist_entry(v, struct file_lock, fl_link); - lock_get_status(f, fl, *((loff_t *)f->private), ""); + lock_get_status(f, fl, iter->li_pos, ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) - lock_get_status(f, bfl, *((loff_t *)f->private), " ->"); + lock_get_status(f, bfl, iter->li_pos, " ->"); return 0; } static void *locks_start(struct seq_file *f, loff_t *pos) { - loff_t *p = f->private; + struct locks_iterator *iter = f->private; - lock_flocks(); - *p = (*pos + 1); - return seq_list_start(&file_lock_list, *pos); + iter->li_pos = *pos + 1; + lg_global_lock(&file_lock_lglock); + spin_lock(&blocked_lock_lock); + return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) { - loff_t *p = f->private; - ++*p; - return seq_list_next(v, &file_lock_list, pos); + struct locks_iterator *iter = f->private; + + ++iter->li_pos; + return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) { - unlock_flocks(); + spin_unlock(&blocked_lock_lock); + lg_global_unlock(&file_lock_lglock); } static const struct seq_operations locks_seq_operations = { @@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = { static int locks_open(struct inode *inode, struct file *filp) { - return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t)); + return seq_open_private(filp, &locks_seq_operations, + sizeof(struct locks_iterator)); } static const struct file_operations proc_locks_operations = { @@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len) { struct file_lock *fl; int result = 1; - lock_flocks(); + + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (IS_POSIX(fl)) { if (fl->fl_type == F_RDLCK) @@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len) result = 0; break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return result; } @@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) { struct file_lock *fl; int result = 1; - lock_flocks(); + + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (IS_POSIX(fl)) { if ((fl->fl_end < start) || (fl->fl_start > (start + len))) @@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) result = 0; break; } - unlock_flocks(); + spin_unlock(&inode->i_lock); return result; } @@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write); static int __init filelock_init(void) { + int i; + filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + lg_lock_init(&file_lock_lglock, "file_lock_lglock"); + + for_each_possible_cpu(i) + INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + return 0; } diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b827510..6bdc347 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry) /* FIXME: readdir currently has it's own dir_walk code. I don't see a good * way to combine the two copies */ -#define IMPLICIT_NODES 2 -static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +static int logfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); - loff_t pos = file->f_pos - IMPLICIT_NODES; + loff_t pos; struct page *page; struct logfs_disk_dentry *dd; - int full; + if (ctx->pos < 0) + return -EINVAL; + + if (!dir_emit_dots(file, ctx)) + return 0; + + pos = ctx->pos - 2; BUG_ON(pos < 0); - for (;; pos++) { + for (;; pos++, ctx->pos++) { + bool full; if (beyond_eof(dir, pos)) break; if (!logfs_exist_block(dir, pos)) { @@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) dd = kmap(page); BUG_ON(dd->namelen == 0); - full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), - pos, be64_to_cpu(dd->ino), dd->type); + full = !dir_emit(ctx, (char *)dd->name, + be16_to_cpu(dd->namelen), + be64_to_cpu(dd->ino), dd->type); kunmap(page); page_cache_release(page); if (full) break; } - - file->f_pos = pos + IMPLICIT_NODES; return 0; } -static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) -{ - struct inode *inode = file_inode(file); - ino_t pino = parent_ino(file->f_dentry); - int err; - - if (file->f_pos < 0) - return -EINVAL; - - if (file->f_pos == 0) { - if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) - return 0; - file->f_pos++; - } - if (file->f_pos == 1) { - if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) - return 0; - file->f_pos++; - } - - err = __logfs_readdir(file, buf, filldir); - return err; -} - static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) { dd->namelen = cpu_to_be16(name->len); @@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = { const struct file_operations logfs_dir_fops = { .fsync = logfs_fsync, .unlocked_ioctl = logfs_ioctl, - .readdir = logfs_readdir, + .iterate = logfs_readdir, .read = generic_read_dir, .llseek = default_llseek, }; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index c2219a6..57914fc 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc) return __logfs_writepage(page); } -static void logfs_invalidatepage(struct page *page, unsigned long offset) +static void logfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct logfs_block *block = logfs_block(page); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 038da09..d448a77 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb) return area; } -static void map_invalidatepage(struct page *page, unsigned long l) +static void map_invalidatepage(struct page *page, unsigned int o, + unsigned int l) { return; } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index a9ed6f3..dfaf6fa 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -16,12 +16,12 @@ typedef struct minix_dir_entry minix_dirent; typedef struct minix3_dir_entry minix3_dirent; -static int minix_readdir(struct file *, void *, filldir_t); +static int minix_readdir(struct file *, struct dir_context *); const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = minix_readdir, + .iterate = minix_readdir, .fsync = generic_file_fsync, }; @@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) return (void*)((char*)de + sbi->s_dirsize); } -static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int minix_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = filp->f_pos; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(sb); unsigned chunk_size = sbi->s_dirsize; - char *name; - __u32 inumber; + unsigned long npages = dir_pages(inode); + unsigned long pos = ctx->pos; + unsigned offset; + unsigned long n; - pos = (pos + chunk_size-1) & ~(chunk_size-1); + ctx->pos = pos = ALIGN(pos, chunk_size); if (pos >= inode->i_size) - goto done; + return 0; + + offset = pos & ~PAGE_CACHE_MASK; + n = pos >> PAGE_CACHE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; @@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; for ( ; p <= limit; p = minix_next_entry(p, sbi)) { + const char *name; + __u32 inumber; if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; name = de3->name; @@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) inumber = de->inode; } if (inumber) { - int over; - unsigned l = strnlen(name, sbi->s_namelen); - offset = p - kaddr; - over = filldir(dirent, name, l, - (n << PAGE_CACHE_SHIFT) | offset, - inumber, DT_UNKNOWN); - if (over) { + if (!dir_emit(ctx, name, l, + inumber, DT_UNKNOWN)) { dir_put_page(page); - goto done; + return 0; } } + ctx->pos += chunk_size; } dir_put_page(page); } - -done: - filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; return 0; } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 0db73d9..cd950e2 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, return error; } +static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + int error; + struct inode *inode = minix_new_inode(dir, mode, &error); + if (inode) { + minix_set_inode(inode, 0); + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + } + return error; +} + static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = { .mknod = minix_mknod, .rename = minix_rename, .getattr = minix_getattr, + .tmpfile = minix_tmpfile, }; @@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); + dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, - &this); + err = parent->d_op->d_hash(parent, &this); if (err < 0) break; } @@ -1976,7 +1975,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) { + if (!can_lookup(nd->inode)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, base->d_inode, &this); + int err = base->d_op->d_hash(base, &this); if (err < 0) return ERR_PTR(err); } @@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path, nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; - switch (nd->last_type) { - case LAST_DOTDOT: - case LAST_DOT: + if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (error) return error; - /* fallthrough */ - case LAST_ROOT: - error = complete_walk(nd); - if (error) - return error; - audit_inode(name, nd->path.dentry, 0); - if (open_flag & O_CREAT) { - error = -EISDIR; - goto out; - } - goto finish_open; - case LAST_BIND: - error = complete_walk(nd); - if (error) - return error; - audit_inode(name, dir, 0); goto finish_open; } @@ -2841,19 +2822,19 @@ finish_lookup: } nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ +finish_open: error = complete_walk(nd); if (error) { path_put(&save_parent); return error; } + audit_inode(name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) + if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) goto out; - audit_inode(name, nd->path.dentry, 0); -finish_open: if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -2920,6 +2901,67 @@ stale_open: goto retry_lookup; } +static int do_tmpfile(int dfd, struct filename *pathname, + struct nameidata *nd, int flags, + const struct open_flags *op, + struct file *file, int *opened) +{ + static const struct qstr name = QSTR_INIT("/", 1); + struct dentry *dentry, *child; + struct inode *dir; + int error = path_lookupat(dfd, pathname->name, + flags | LOOKUP_DIRECTORY, nd); + if (unlikely(error)) + return error; + error = mnt_want_write(nd->path.mnt); + if (unlikely(error)) + goto out; + /* we want directory to be writable */ + error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); + if (error) + goto out2; + dentry = nd->path.dentry; + dir = dentry->d_inode; + if (!dir->i_op->tmpfile) { + error = -EOPNOTSUPP; + goto out2; + } + child = d_alloc(dentry, &name); + if (unlikely(!child)) { + error = -ENOMEM; + goto out2; + } + nd->flags &= ~LOOKUP_DIRECTORY; + nd->flags |= op->intent; + dput(nd->path.dentry); + nd->path.dentry = child; + error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); + if (error) + goto out2; + audit_inode(pathname, nd->path.dentry, 0); + error = may_open(&nd->path, op->acc_mode, op->open_flag); + if (error) + goto out2; + file->f_path.mnt = nd->path.mnt; + error = finish_open(file, nd->path.dentry, NULL, opened); + if (error) + goto out2; + error = open_check_o_direct(file); + if (error) { + fput(file); + } else if (!(op->open_flag & O_EXCL)) { + struct inode *inode = file_inode(file); + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } +out2: + mnt_drop_write(nd->path.mnt); +out: + path_put(&nd->path); + return error; +} + static struct file *path_openat(int dfd, struct filename *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { @@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname, file->f_flags = op->open_flag; + if (unlikely(file->f_flags & O_TMPFILE)) { + error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); + goto out; + } + error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out; @@ -2987,9 +3034,10 @@ out: } struct file *do_filp_open(int dfd, struct filename *pathname, - const struct open_flags *op, int flags) + const struct open_flags *op) { struct nameidata nd; + int flags = op->lookup_flags; struct file *filp; filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); @@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname, } struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *name, const struct open_flags *op, int flags) + const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename filename = { .name = name }; + int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; nd.root.dentry = dentry; - flags |= LOOKUP_ROOT; - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); @@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de mutex_lock(&inode->i_mutex); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0) + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else error = dir->i_op->link(old_dentry, dir, new_dentry); + + if (!error && (inode->i_state & I_LINKABLE)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_LINKABLE; + spin_unlock(&inode->i_lock); + } mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 8163260..3be0474 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -23,12 +23,12 @@ #include "ncp_fs.h" -static void ncp_read_volume_list(struct file *, void *, filldir_t, +static void ncp_read_volume_list(struct file *, struct dir_context *, struct ncp_cache_control *); -static void ncp_do_readdir(struct file *, void *, filldir_t, +static void ncp_do_readdir(struct file *, struct dir_context *, struct ncp_cache_control *); -static int ncp_readdir(struct file *, void *, filldir_t); +static int ncp_readdir(struct file *, struct dir_context *); static int ncp_create(struct inode *, struct dentry *, umode_t, bool); static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); @@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ncp_readdir, + .iterate = ncp_readdir, .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, @@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations = * Dentry operations routines */ static int ncp_lookup_validate(struct dentry *, unsigned int); -static int ncp_hash_dentry(const struct dentry *, const struct inode *, - struct qstr *); -static int ncp_compare_dentry(const struct dentry *, const struct inode *, - const struct dentry *, const struct inode *, +static int ncp_hash_dentry(const struct dentry *, struct qstr *); +static int ncp_compare_dentry(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); @@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i) /* * Note: leave the hash unchanged if the directory * is case-sensitive. + * + * Accessing the parent inode can be racy under RCU pathwalking. + * Use ACCESS_ONCE() to make sure we use _one_ particular inode, + * the callers will handle races. */ static int -ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, - struct qstr *this) +ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) { + struct inode *inode = ACCESS_ONCE(dentry->d_inode); + + if (!inode) + return 0; + if (!ncp_case_sensitive(inode)) { struct super_block *sb = dentry->d_sb; struct nls_table *t; @@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, return 0; } +/* + * Accessing the parent inode can be racy under RCU pathwalking. + * Use ACCESS_ONCE() to make sure we use _one_ particular inode, + * the callers will handle races. + */ static int -ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { + struct inode *pinode; + if (len != name->len) return 1; + pinode = ACCESS_ONCE(parent->d_inode); + if (!pinode) + return 1; + if (ncp_case_sensitive(pinode)) return strncmp(str, name->name, len); @@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) return ncp_date_dos2unix(i.modifyTime, i.modifyDate); } -static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ncp_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct page *page = NULL; struct ncp_server *server = NCP_SERVER(inode); @@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (int) filp->f_pos); + (int) ctx->pos); result = -EIO; /* Do not generate '.' and '..' when server is dead. */ @@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; result = 0; - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) - goto out; - filp->f_pos = 1; - } - if (filp->f_pos == 1) { - if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR)) - goto out; - filp->f_pos = 2; - } + if (!dir_emit_dots(file, ctx)) + goto out; page = grab_cache_page(&inode->i_data, 0); if (!page) @@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) if (!PageUptodate(page) || !ctl.head.eof) goto init_cache; - if (filp->f_pos == 2) { + if (ctx->pos == 2) { if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) goto init_cache; @@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) goto init_cache; } - if (filp->f_pos > ctl.head.end) + if (ctx->pos > ctl.head.end) goto finished; - ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); + ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2); ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; @@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) } while (ctl.idx < NCP_DIRCACHE_SIZE) { struct dentry *dent; - int res; + bool over; dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], - dentry, filp->f_pos); + dentry, ctx->pos); if (!dent) goto invalid_cache; - res = filldir(dirent, dent->d_name.name, - dent->d_name.len, filp->f_pos, + over = !dir_emit(ctx, dent->d_name.name, + dent->d_name.len, dent->d_inode->i_ino, DT_UNKNOWN); dput(dent); - if (res) + if (over) goto finished; - filp->f_pos += 1; + ctx->pos += 1; ctl.idx += 1; - if (filp->f_pos > ctl.head.end) + if (ctx->pos > ctl.head.end) goto finished; } if (ctl.page) { @@ -548,9 +556,9 @@ init_cache: ctl.valid = 1; read_really: if (ncp_is_server_root(inode)) { - ncp_read_volume_list(filp, dirent, filldir, &ctl); + ncp_read_volume_list(file, ctx, &ctl); } else { - ncp_do_readdir(filp, dirent, filldir, &ctl); + ncp_do_readdir(file, ctx, &ctl); } ctl.head.end = ctl.fpos - 1; ctl.head.eof = ctl.valid; @@ -573,11 +581,11 @@ out: } static int -ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, +ncp_fill_cache(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, int inval_childs) { - struct dentry *newdent, *dentry = filp->f_path.dentry; + struct dentry *newdent, *dentry = file->f_path.dentry; struct inode *dir = dentry->d_inode; struct ncp_cache_control ctl = *ctrl; struct qstr qname; @@ -666,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, end_advance: if (!valid) ctl.valid = 0; - if (!ctl.filled && (ctl.fpos == filp->f_pos)) { - if (!ino) - ino = find_inode_number(dentry, &qname); + if (!ctl.filled && (ctl.fpos == ctx->pos)) { if (!ino) ino = iunique(dir->i_sb, 2); - ctl.filled = filldir(dirent, qname.name, qname.len, - filp->f_pos, ino, DT_UNKNOWN); + ctl.filled = !dir_emit(ctx, qname.name, qname.len, + ino, DT_UNKNOWN); if (!ctl.filled) - filp->f_pos += 1; + ctx->pos += 1; } ctl.fpos += 1; ctl.idx += 1; @@ -683,10 +689,10 @@ end_advance: } static void -ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, +ncp_read_volume_list(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctl) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct ncp_server *server = NCP_SERVER(inode); struct ncp_volume_info info; @@ -694,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, int i; DPRINTK("ncp_read_volume_list: pos=%ld\n", - (unsigned long) filp->f_pos); + (unsigned long) ctx->pos); for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { int inval_dentry; @@ -715,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, } inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) + if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry)) return; } } static void -ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, +ncp_do_readdir(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctl) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *dir = dentry->d_inode; struct ncp_server *server = NCP_SERVER(dir); struct nw_search_sequence seq; @@ -736,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) filp->f_pos); + (unsigned long) ctx->pos); PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", dentry->d_name.name, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); @@ -778,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, rpl += onerpl; rpls -= onerpl; entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) + if (!ncp_fill_cache(file, ctx, ctl, &entry, 0)) break; } } while (more); @@ -1029,15 +1035,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); - /* - * fail with EBUSY if there are still references to this - * directory. - */ - dentry_unhash(dentry); - error = -EBUSY; - if (!d_unhashed(dentry)) - goto out; - len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); @@ -1140,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name); - if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) { - /* - * fail with EBUSY if there are still references to this - * directory. - */ - dentry_unhash(new_dentry); - error = -EBUSY; - if (!d_unhashed(new_dentry)) - goto out; - } - ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 26910c8..4659da6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) switch (optval) { case 'u': data->uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->uid)) + if (!uid_valid(data->uid)) { + ret = -EINVAL; goto err; + } break; case 'g': data->gid = make_kgid(current_user_ns(), optint); - if (!gid_valid(data->gid)) + if (!gid_valid(data->gid)) { + ret = -EINVAL; goto err; + } break; case 'o': data->mounted_uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->mounted_uid)) + if (!uid_valid(data->mounted_uid)) { + ret = -EINVAL; goto err; + } break; case 'm': data->file_mode = optint; @@ -891,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) if (!server) /* How this could happen? */ goto out; + result = -EPERM; + if (IS_DEADDIR(dentry->d_inode)) + goto out; + /* ageing the dentry to force validation */ ncp_age_dentry(server, dentry); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index ee24df5..3c5dd55 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; /* we do not support files bigger than 4GB... We eventually supports just 4GB... */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + if (vma_pages(vma) + vma->vm_pgoff > (1U << (32 - PAGE_SHIFT))) return -EFBIG; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 13ca196..b5e80b0 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -104,6 +104,15 @@ config NFS_V4_1 If unsure, say N. +config NFS_V4_2 + bool "NFS client support for NFSv4.2" + depends on NFS_V4_1 + help + This option enables support for minor version 2 of the NFSv4 protocol + in the kernel's NFS client. + + If unsure, say N. + config PNFS_FILE_LAYOUT tristate depends on NFS_V4_1 @@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN If the NFS client is unchanged from the upstream kernel, this option should be set to the default "kernel.org". +config NFS_V4_SECURITY_LABEL + bool + depends on NFS_V4_2 && SECURITY + default y + config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index cce2c05..e0bb048 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ direct.o pagelist.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o \ - dns_resolve.o cache_lib.o + write.o namespace.o mount_clnt.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o @@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ - nfs4namespace.o nfs4getroot.o nfs4client.o + nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o +nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 434b93e..e242bbf 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, dev->pgbase = 0; dev->pglen = PAGE_SIZE * max_pages; dev->mincount = 0; + dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); - rc = nfs4_proc_getdeviceinfo(server, dev); + rc = nfs4_proc_getdeviceinfo(server, dev, NULL); dprintk("%s getdevice info returns %d\n", __func__, rc); if (rc) { rv = ERR_PTR(rc); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index cff089a..67cd732 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_rqst *rqstp; int (*callback_svc)(void *vrqstp); struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; - char svc_name[12]; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); @@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, svc_sock_update_bufs(serv); - sprintf(svc_name, "nfsv4.%u-svc", minorversion); cb_info->serv = serv; cb_info->rqst = rqstp; - cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + cb_info->task = kthread_run(callback_svc, cb_info->rqst, + "nfsv4.%u-svc", minorversion); if (IS_ERR(cb_info->task)) { ret = PTR_ERR(cb_info->task); svc_exit_thread(cb_info->rqst); @@ -282,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n ret = nfs4_callback_up_net(serv, net); break; case 1: + case 2: ret = nfs41_callback_up_net(serv, net); break; default: diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index efd54f0a..84326e9 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -32,6 +32,8 @@ enum nfs4_callback_opnum { OP_CB_WANTS_CANCELLED = 12, OP_CB_NOTIFY_LOCK = 13, OP_CB_NOTIFY_DEVICEID = 14, +/* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, OP_CB_ILLEGAL = 10044, }; @@ -39,6 +41,7 @@ struct cb_process_state { __be32 drc_status; struct nfs_client *clp; u32 slotid; + u32 minorversion; struct net *net; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a13d26e..e6ebc4c 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, int i; __be32 status = htonl(NFS4ERR_BADSESSION); - clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); + clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, + &args->csa_sessionid, cps->minorversion); if (clp == NULL) goto out; @@ -414,7 +415,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ - if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 59461c9..f4ccfe6 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); hdr->minorversion = ntohl(*p++); - /* Check minor version is zero or one. */ - if (hdr->minorversion <= 1) { - hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ + /* Check for minor version support */ + if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) { + hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */ } else { pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " "illegal minor version %u!\n", @@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * A single slot, so highest used slotid is either 0 or -1 */ tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(session, tbl); + nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } @@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps) } #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + __be32 status = preprocess_nfs41_op(nop, op_nr, op); + if (status != htonl(NFS4ERR_OP_ILLEGAL)) + return status; + + if (op_nr == OP_CB_OFFLOAD) + return htonl(NFS4ERR_NOTSUPP); + return htonl(NFS4ERR_OP_ILLEGAL); +} +#else /* CONFIG_NFS_V4_2 */ +static __be32 +preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) +{ + return htonl(NFS4ERR_MINOR_VERS_MISMATCH); +} +#endif /* CONFIG_NFS_V4_2 */ + static __be32 preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) { @@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) return htonl(NFS_OK); } -static __be32 process_op(uint32_t minorversion, int nop, - struct svc_rqst *rqstp, +static __be32 process_op(int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, struct xdr_stream *xdr_out, void *resp, struct cb_process_state *cps) @@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop, return status; dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", - __func__, minorversion, nop, op_nr); + __func__, cps->minorversion, nop, op_nr); + + switch (cps->minorversion) { + case 0: + status = preprocess_nfs4_op(op_nr, &op); + break; + case 1: + status = preprocess_nfs41_op(nop, op_nr, &op); + break; + case 2: + status = preprocess_nfs42_op(nop, op_nr, &op); + break; + default: + status = htonl(NFS4ERR_MINOR_VERS_MISMATCH); + } - status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) : - preprocess_nfs4_op(op_nr, &op); if (status == htonl(NFS4ERR_OP_ILLEGAL)) op_nr = OP_CB_ILLEGAL; if (status) @@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_drop_reply; } + cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) return rpc_system_err; while (status == 0 && nops != hdr_arg.nops) { - status = process_op(hdr_arg.minorversion, nops, rqstp, - &xdr_in, argp, &xdr_out, resp, &cps); + status = process_op(nops, rqstp, &xdr_in, + argp, &xdr_out, resp, &cps); nops++; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c513b0c..340b1ef 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); - if (server->options & NFS_OPTION_MIGRATION) - set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); @@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, } if (!(fattr->valid & NFS_ATTR_FATTR)) { - error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); + error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 57db324..7ec4814 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ if (inode->i_flock == NULL) goto out; - /* Protect inode->i_flock using the file locks lock */ - lock_flocks(); + /* Protect inode->i_flock using the i_lock */ + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file) != ctx) continue; - unlock_flocks(); + spin_unlock(&inode->i_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; - lock_flocks(); + spin_lock(&inode->i_lock); } - unlock_flocks(); + spin_unlock(&inode->i_lock); out: return status; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e093e73..0fac2cb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,6 +33,7 @@ #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/swap.h> #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> @@ -46,7 +47,7 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); -static int nfs_readdir(struct file *, void *, filldir_t); +static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct page*); @@ -54,7 +55,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .readdir = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -147,6 +148,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int); typedef struct { struct file *file; struct page *page; + struct dir_context *ctx; unsigned long page_index; u64 *dir_cookie; u64 last_cookie; @@ -252,7 +254,7 @@ out: static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { - loff_t diff = desc->file->f_pos - desc->current_index; + loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; if (diff < 0) @@ -289,7 +291,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->file->f_pos) { + } else if (new_pos < desc->ctx->pos) { if (ctx->duped > 0 && ctx->dup_cookie == *desc->dir_cookie) { if (printk_ratelimit()) { @@ -307,7 +309,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des ctx->dup_cookie = *desc->dir_cookie; ctx->duped = -1; } - desc->file->f_pos = new_pos; + desc->ctx->pos = new_pos; desc->cache_entry_index = i; return 0; } @@ -405,13 +407,13 @@ different: } static -bool nfs_use_readdirplus(struct inode *dir, struct file *filp) +bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) return true; - if (filp->f_pos == 0) + if (ctx->pos == 0) return true; return false; } @@ -435,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct dentry *alias; struct inode *dir = parent->d_inode; struct inode *inode; + int status; if (filename.name[0] == '.') { if (filename.len == 1) @@ -447,7 +450,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) dentry = d_lookup(parent, &filename); if (dentry != NULL) { if (nfs_same_file(dentry, entry)) { - nfs_refresh_inode(dentry->d_inode, entry->fattr); + status = nfs_refresh_inode(dentry->d_inode, entry->fattr); + if (!status) + nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); goto out; } else { if (d_invalidate(dentry) != 0) @@ -460,7 +465,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (dentry == NULL) return; - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); if (IS_ERR(inode)) goto out; @@ -585,10 +590,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (entry.fh == NULL || entry.fattr == NULL) goto out; + entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry.label)) { + status = PTR_ERR(entry.label); + goto out; + } + array = nfs_readdir_get_array(page); if (IS_ERR(array)) { status = PTR_ERR(array); - goto out; + goto out_label_free; } memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; @@ -614,6 +625,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: nfs_readdir_release_array(page); +out_label_free: + nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); nfs_free_fhandle(entry.fh); @@ -702,8 +715,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) * Once we've found the start of the dirent within a page: fill 'er up... */ static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, - filldir_t filldir) +int nfs_do_filldir(nfs_readdir_descriptor_t *desc) { struct file *file = desc->file; int i = 0; @@ -721,13 +733,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, struct nfs_cache_array_entry *ent; ent = &array->array[i]; - if (filldir(dirent, ent->string.name, ent->string.len, - file->f_pos, nfs_compat_user_ino64(ent->ino), - ent->d_type) < 0) { + if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eof = 1; break; } - file->f_pos++; + desc->ctx->pos++; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else @@ -759,8 +770,7 @@ out: * directory in the page cache by the time we get here. */ static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, - filldir_t filldir) +int uncached_readdir(nfs_readdir_descriptor_t *desc) { struct page *page = NULL; int status; @@ -785,7 +795,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, if (status < 0) goto out_release; - status = nfs_do_filldir(desc, dirent, filldir); + status = nfs_do_filldir(desc); out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", @@ -800,35 +810,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, last cookie cache takes care of the common case of reading the whole directory. */ -static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int nfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - struct nfs_open_dir_context *dir_ctx = filp->private_data; + struct nfs_open_dir_context *dir_ctx = file->private_data; int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (long long)filp->f_pos); + (long long)ctx->pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); /* - * filp->f_pos points to the dirent entry number. + * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. */ memset(desc, 0, sizeof(*desc)); - desc->file = filp; + desc->file = file; + desc->ctx = ctx; desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; + desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -840,7 +851,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* This means either end of directory */ if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ - res = uncached_readdir(desc, dirent, filldir); + res = uncached_readdir(desc); if (res == 0) continue; } @@ -857,7 +868,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res < 0) break; - res = nfs_do_filldir(desc, dirent, filldir); + res = nfs_do_filldir(desc); if (res < 0) break; } while (!desc->eof); @@ -1040,6 +1051,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; int error; if (flags & LOOKUP_RCU) @@ -1082,7 +1094,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (fhandle == NULL || fattr == NULL) goto out_error; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(label)) + goto out_error; + + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1090,8 +1106,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if ((error = nfs_refresh_inode(inode, fattr)) != 0) goto out_bad; + nfs_setsecurity(inode, fattr, label); + nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); + nfs4_label_free(label); + out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: @@ -1108,6 +1128,7 @@ out_zap_parent: out_bad: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); + nfs4_label_free(label); nfs_mark_for_revalidate(dir); if (inode && S_ISDIR(inode->i_mode)) { /* Purge readdir caches. */ @@ -1128,6 +1149,7 @@ out_zap_parent: out_error: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); + nfs4_label_free(label); dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", __func__, dentry->d_parent->d_name.name, @@ -1256,6 +1278,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; int error; dfprintk(VFS, "NFS: lookup(%s/%s)\n", @@ -1282,17 +1305,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (fhandle == NULL || fattr == NULL) goto out; + label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT); + if (IS_ERR(label)) + goto out; + parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); goto out_unblock_sillyrename; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); res = ERR_CAST(inode); if (IS_ERR(res)) goto out_unblock_sillyrename; @@ -1310,6 +1337,7 @@ no_entry: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_unblock_sillyrename: nfs_unblock_sillyrename(parent); + nfs4_label_free(label); out: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); @@ -1357,18 +1385,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; - if (ctx->dentry != dentry) { - dput(ctx->dentry); - ctx->dentry = dget(dentry); - } - - /* If the open_intent is for execute, we have an extra check to make */ - if (ctx->mode & FMODE_EXEC) { - err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags); - if (err < 0) - goto out; - } - err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -1427,13 +1443,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, nfs_block_sillyrename(dentry->d_parent); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); - d_drop(dentry); + nfs_unblock_sillyrename(dentry->d_parent); if (IS_ERR(inode)) { - nfs_unblock_sillyrename(dentry->d_parent); put_nfs_open_context(ctx); err = PTR_ERR(inode); switch (err) { case -ENOENT: + d_drop(dentry); d_add(dentry, NULL); break; case -EISDIR: @@ -1449,16 +1465,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } goto out; } - res = d_add_unique(dentry, inode); - if (res != NULL) - dentry = res; - - nfs_unblock_sillyrename(dentry->d_parent); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - - err = nfs_finish_open(ctx, dentry, file, open_flags, opened); - dput(res); + err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); out: return err; @@ -1528,7 +1536,8 @@ no_open: * Code common to create, mkdir, and mknod. */ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; @@ -1541,18 +1550,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) goto out_error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); - error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); if (error < 0) goto out_error; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); error = PTR_ERR(inode); if (IS_ERR(inode)) goto out_error; @@ -1721,7 +1730,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) dir->i_ino, dentry->d_name.name); spin_lock(&dentry->d_lock); - if (dentry->d_count > 1) { + if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); @@ -1759,7 +1768,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink); */ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct pagevec lru_pvec; struct page *page; char *kaddr; struct iattr attr; @@ -1799,11 +1807,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - pagevec_init(&lru_pvec, 0); - if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { - pagevec_add(&lru_pvec, page); - pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else @@ -1870,7 +1875,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, - new_dentry->d_count); + d_count(new_dentry)); /* * For non-directories, check whether the target is busy and if so, @@ -1888,7 +1893,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, rehash = new_dentry; } - if (new_dentry->d_count > 2) { + if (d_count(new_dentry) > 2) { int err; /* copy the target dentry's name */ diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 9455270..fc0f95e 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, kfree(ip_addr); return ret; } -EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); #else @@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = -ESRCH; return ret; } -EXPORT_SYMBOL_GPL(nfs_dns_resolve_name); static struct cache_detail nfs_dns_resolve_template = { .owner = THIS_MODULE, @@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net) cache_destroy_net(nn->nfs_dns_resolve, net); } +static int nfs4_dns_net_init(struct net *net) +{ + return nfs_dns_resolver_cache_init(net); +} + +static void nfs4_dns_net_exit(struct net *net) +{ + nfs_dns_resolver_cache_destroy(net); +} + +static struct pernet_operations nfs4_dns_resolver_ops = { + .init = nfs4_dns_net_init, + .exit = nfs4_dns_net_exit, +}; + static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = { int nfs_dns_resolver_init(void) { - return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); + int err; + + err = register_pernet_subsys(&nfs4_dns_resolver_ops); + if (err < 0) + goto out; + err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block); + if (err < 0) + goto out1; + return 0; +out1: + unregister_pernet_subsys(&nfs4_dns_resolver_ops); +out: + return err; } void nfs_dns_resolver_destroy(void) { rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); + unregister_pernet_subsys(&nfs4_dns_resolver_ops); } #endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a87a44f..94e94bd 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * - Called if either PG_private or PG_fscache is set on the page * - Caller holds page lock */ -static void nfs_invalidate_page(struct page *page, unsigned long offset) +static void nfs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { - dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); + dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", + page, offset, length); - if (offset != 0) + if (offset != 0 || length < PAGE_CACHE_SIZE) return; /* Cancel any unstarted writes on this page */ nfs_wb_page_cancel(page_file_mapping(page)->host, page); @@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp) return nfs_fscache_release_page(page, gfp); } +static void nfs_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct nfs_inode *nfsi; + struct address_space *mapping = page_file_mapping(page); + + if (!mapping || PageSwapCache(page)) + return; + + /* + * Check if an unstable page is currently being committed and + * if so, have the VM treat it as if the page is under writeback + * so it will not block due to pages that will shortly be freeable. + */ + nfsi = NFS_I(mapping->host); + if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + *writeback = true; + return; + } + + /* + * If PagePrivate() is set, then the page is not freeable and as the + * inode is not being committed, it's not going to be cleaned in the + * near future so treat it as dirty + */ + if (PagePrivate(page)) + *dirty = true; +} + /* * Attempt to clear the private state associated with a page when an error * occurs that requires the cached contents of an inode to be written back or @@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = { .direct_IO = nfs_direct_IO, .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, + .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, #ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 44efaa8..66984a9 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, goto out; } - inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); ret = ERR_CAST(inode); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index c516da5..c2c4163 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, return desclen; } -static ssize_t nfs_idmap_request_key(struct key_type *key_type, - const char *name, size_t namelen, - const char *type, void *data, - size_t data_size, struct idmap *idmap) +static struct key *nfs_idmap_request_key(const char *name, size_t namelen, + const char *type, struct idmap *idmap) { - const struct cred *saved_cred; - struct key *rkey; char *desc; - struct user_key_payload *payload; + struct key *rkey; ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); if (ret <= 0) - goto out; + return ERR_PTR(ret); + + rkey = request_key(&key_type_id_resolver, desc, ""); + if (IS_ERR(rkey)) { + mutex_lock(&idmap->idmap_mutex); + rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, + desc, "", 0, idmap); + mutex_unlock(&idmap->idmap_mutex); + } + + kfree(desc); + return rkey; +} + +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, + const char *type, void *data, + size_t data_size, struct idmap *idmap) +{ + const struct cred *saved_cred; + struct key *rkey; + struct user_key_payload *payload; + ssize_t ret; saved_cred = override_creds(id_resolver_cache); - if (idmap) - rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap); - else - rkey = request_key(&key_type_id_resolver, desc, ""); + rkey = nfs_idmap_request_key(name, namelen, type, idmap); revert_creds(saved_cred); - kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; @@ -316,23 +329,6 @@ out: return ret; } -static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, - const char *type, void *data, - size_t data_size, struct idmap *idmap) -{ - ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver, - name, namelen, type, data, - data_size, NULL); - if (ret < 0) { - mutex_lock(&idmap->idmap_mutex); - ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, - name, namelen, type, data, - data_size, idmap); - mutex_unlock(&idmap->idmap_mutex); - } - return ret; -} - /* ID -> Name */ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen, struct idmap *idmap) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c1c7a9d..c93639e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -48,7 +48,6 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" -#include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" @@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word) { if (fatal_signal_pending(current)) return -ERESTARTSYS; - freezable_schedule(); + freezable_schedule_unsafe(); return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); @@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode) memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; nfs_fscache_invalidate(inode); - } else { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; - } + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_LABEL + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; + } else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_LABEL + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; } void nfs_zap_caches(struct inode *inode) @@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque) return 0; } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + int error; + + if (label == NULL) + return; + + if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0) + return; + + if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2) + return; + + if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { + error = security_inode_notifysecctx(inode, label->label, + label->len); + if (error) + printk(KERN_ERR "%s() %s %d " + "security_inode_notifysecctx() %d\n", + __func__, + (char *)label->label, + label->len, error); + } +} + +struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) +{ + struct nfs4_label *label = NULL; + int minor_version = server->nfs_client->cl_minorversion; + + if (minor_version < 2) + return label; + + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + return label; + + label = kzalloc(sizeof(struct nfs4_label), flags); + if (label == NULL) + return ERR_PTR(-ENOMEM); + + label->label = kzalloc(NFS4_MAXLABELLEN, flags); + if (label->label == NULL) { + kfree(label); + return ERR_PTR(-ENOMEM); + } + label->len = NFS4_MAXLABELLEN; + + return label; +} +EXPORT_SYMBOL_GPL(nfs4_label_alloc); +#else +void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ +} +#endif +EXPORT_SYMBOL_GPL(nfs_setsecurity); + /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. */ struct inode * -nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs_find_desc desc = { .fh = fh, @@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } + + nfs_setsecurity(inode, fattr, label); + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; @@ -393,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); + nfs_setsecurity(inode, fattr, label); dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), @@ -449,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) - nfs_refresh_inode(inode, fattr); + error = nfs_refresh_inode(inode, fattr); nfs_free_fattr(fattr); out: return error; @@ -713,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context); * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ -void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = ctx->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); - filp->private_data = get_nfs_open_context(ctx); spin_lock(&inode->i_lock); list_add(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); + +void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) +{ + filp->private_data = get_nfs_open_context(ctx); + if (list_empty(&ctx->list)) + nfs_inode_attach_open_context(ctx); +} EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* @@ -748,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c static void nfs_file_clear_open_context(struct file *filp) { - struct inode *inode = file_inode(filp); struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { + struct inode *inode = ctx->dentry->d_inode; + filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -790,6 +869,7 @@ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; + struct nfs4_label *label = NULL; struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); @@ -807,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); + + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) { + status = PTR_ERR(label); + goto out; + } + + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", inode->i_sb->s_id, @@ -817,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (!S_ISDIR(inode->i_mode)) set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } - goto out; + goto err_out; } status = nfs_refresh_inode(inode, fattr); @@ -825,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); - goto out; + goto err_out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) @@ -835,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - out: +err_out: + nfs4_label_free(label); +out: nfs_free_fattr(fattr); return status; } @@ -863,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) + if (!(NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) && !nfs_attribute_cache_expired(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); @@ -1243,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); + return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); @@ -1483,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_blocks = fattr->du.nfs2.blocks; /* Update attrtimeo value if we're out of the unstable period */ - if (invalid & NFS_INO_INVALID_ATTR) { + if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1496,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } invalid &= ~NFS_INO_INVALID_ATTR; + invalid &= ~NFS_INO_INVALID_LABEL; /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -1638,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { nfs_clients_init(net); - return nfs_dns_resolver_cache_init(net); + return 0; } static void nfs_net_exit(struct net *net) { - nfs_dns_resolver_cache_destroy(net); nfs_cleanup_cb_ident_idr(net); } @@ -1661,10 +1752,6 @@ static int __init init_nfs_fs(void) { int err; - err = nfs_dns_resolver_init(); - if (err < 0) - goto out10;; - err = register_pernet_subsys(&nfs_net_ops); if (err < 0) goto out9; @@ -1730,8 +1817,6 @@ out7: out8: unregister_pernet_subsys(&nfs_net_ops); out9: - nfs_dns_resolver_destroy(); -out10: return err; } @@ -1744,7 +1829,6 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); - nfs_dns_resolver_destroy(); #ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 91e59a3..3c8373f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, - struct nfs4_sessionid *); + struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, struct nfs_subversion *); extern struct nfs_server *nfs4_create_server( @@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *, #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; extern const u32 nfs41_maxwrite_overhead; +extern const u32 nfs41_maxgetdevinfo_overhead; #endif /* nfs4proc.c */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 91a6faf..99a4528 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -139,7 +139,10 @@ struct mnt_fhstatus { * nfs_mount - Obtain an NFS file handle for the given host and path * @info: pointer to mount request arguments * - * Uses default timeout parameters specified by underlying transport. + * Uses default timeout parameters specified by underlying transport. On + * successful return, the auth_flavs list and auth_flav_len will be populated + * with the list from the server or a faked-up list if the server didn't + * provide one. */ int nfs_mount(struct nfs_mount_request *info) { @@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info) dprintk("NFS: MNT request succeeded\n"); status = 0; + /* + * If the server didn't provide a flavor list, allow the + * client to try any flavor. + */ + if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) { + dprintk("NFS: Faking up auth_flavs list\n"); + info->auth_flavs[0] = RPC_AUTH_NULL; + *info->auth_flav_len = 1; + } out: return status; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index fc8dc20..348b535 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, struct dentry *parent = dget_parent(dentry); /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); dput(parent); if (err != 0) return ERR_PTR(err); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 43ea96c..f5c84c3 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; } while (!fatal_signal_pending(current)); return res; @@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], @@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, static int nfs3_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { struct nfs3_diropargs arg = { .fh = NFS_FH(dir), @@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_ status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); return status; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a1dd768..ee81e35 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops { int (*recover_lock)(struct nfs4_state *, struct file_lock *); int (*establish_clid)(struct nfs_client *, struct rpc_cred *); struct rpc_cred * (*get_clid_cred)(struct nfs_client *); - int (*reclaim_complete)(struct nfs_client *); + int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); int (*detect_trunking)(struct nfs_client *, struct nfs_client **, struct rpc_cred *); }; @@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp) extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[3]; -extern const u32 nfs4_statfs_bitmap[2]; -extern const u32 nfs4_pathconf_bitmap[2]; +extern const u32 nfs4_statfs_bitmap[3]; +extern const u32 nfs4_pathconf_bitmap[3]; extern const u32 nfs4_fsinfo_bitmap[3]; -extern const u32 nfs4_fs_locations_bitmap[2]; +extern const u32 nfs4_fs_locations_bitmap[3]; void nfs4_free_client(struct nfs_client *); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 947b0c9..90dce91 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) if (err) goto error; + if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) { + err = -EINVAL; + goto error; + } + spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); @@ -203,7 +208,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_NULL); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr, */ struct nfs_client * nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, - struct nfs4_sessionid *sid) + struct nfs4_sessionid *sid, u32 minorversion) { struct nfs_client *clp; struct nfs_net *nn = net_generic(net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, 1) == false) + if (nfs4_cb_match_client(addr, clp, minorversion) == false) continue; if (!nfs4_has_session(clp)) @@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, struct nfs_client * nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, - struct nfs4_sessionid *sid) + struct nfs4_sessionid *sid, u32 minorversion) { return NULL; } @@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server, if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (server->options & NFS_OPTION_MIGRATION) + set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); @@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, return -ENOMEM; /* We must ensure the session is initialised first */ - error = nfs4_init_session(server); + error = nfs4_init_session(server->nfs_client); if (error < 0) goto out; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 13e6bb3..e5b804d 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) goto out_drop; } } - iput(inode); if (inode != dentry->d_inode) goto out_drop; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 22d1062..17ed87e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, NFS_SERVER(lo->plh_inode)->nfs_client, id); if (d == NULL) { - dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); + dsaddr = filelayout_get_device_info(lo->plh_inode, id, + lo->plh_lc_cred, gfp_flags); if (dsaddr == NULL) goto out; } else diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 235ff95..cebd20e 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); +filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 661a0f6..95604f6 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl * of available devices, and return it. */ struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) +filelayout_get_device_info(struct inode *inode, + struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, + gfp_t gfp_flags) { struct pnfs_device *pdev = NULL; u32 max_resp_sz; @@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf pdev->pgbase = 0; pdev->pglen = max_resp_sz; pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - rc = nfs4_proc_getdeviceinfo(server, pdev); + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); dprintk("%s getdevice info returns %d\n", __func__, rc); if (rc) goto out_free; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8fbc100..cf11799 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); -static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state); + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 -static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); -static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, + struct rpc_cred *); #endif + +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *label) +{ + int err; + + if (label == NULL) + return NULL; + + if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) + return NULL; + + if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2) + return NULL; + + err = security_dentry_init_security(dentry, sattr->ia_mode, + &dentry->d_name, (void **)&label->label, &label->len); + if (err == 0) + return label; + + return NULL; +} +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ + if (label) + security_release_secctx(label->label, label->len); +} +static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ + if (label) + return server->attr_bitmask; + + return server->attr_bitmask_nl; +} +#else +static inline struct nfs4_label * +nfs4_label_init_security(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, struct nfs4_label *l) +{ return NULL; } +static inline void +nfs4_label_release_security(struct nfs4_label *label) +{ return; } +static inline u32 * +nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) +{ return server->attr_bitmask; } +#endif + /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { @@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_TIME_MODIFY, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + FATTR4_WORD2_SECURITY_LABEL +#endif }; static const u32 nfs4_pnfs_open_bitmap[3] = { @@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = { | FATTR4_WORD0_FILEID, }; -const u32 nfs4_statfs_bitmap[2] = { +const u32 nfs4_statfs_bitmap[3] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL, @@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = { | FATTR4_WORD1_SPACE_TOTAL }; -const u32 nfs4_pathconf_bitmap[2] = { +const u32 nfs4_pathconf_bitmap[3] = { FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME, 0 @@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE FATTR4_WORD2_LAYOUT_BLKSIZE }; -const u32 nfs4_fs_locations_bitmap[2] = { +const u32 nfs4_fs_locations_bitmap[3] = { FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE @@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = { | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY - | FATTR4_WORD1_MOUNTED_ON_FILEID + | FATTR4_WORD1_MOUNTED_ON_FILEID, }; static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -268,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) *timeout = NFS4_POLL_RETRY_MIN; if (*timeout > NFS4_POLL_RETRY_MAX) *timeout = NFS4_POLL_RETRY_MAX; - freezable_schedule_timeout_killable(*timeout); + freezable_schedule_timeout_killable_unsafe(*timeout); if (fatal_signal_pending(current)) res = -ERESTARTSYS; *timeout <<= 1; @@ -572,7 +628,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, task->tk_timeout = 0; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ dprintk("%s session is draining\n", __func__); @@ -762,6 +818,7 @@ struct nfs4_opendata { struct nfs4_string owner_name; struct nfs4_string group_name; struct nfs_fattr f_attr; + struct nfs4_label *f_label; struct dentry *dir; struct dentry *dentry; struct nfs4_state_owner *owner; @@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server, static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; + p->o_res.f_label = p->f_label; p->o_res.seqid = p->o_arg.seqid; p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; @@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, const struct iattr *attrs, + struct nfs4_label *label, enum open_claim_type4 claim, gfp_t gfp_mask) { @@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) goto err; + + p->f_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->f_label)) + goto err_free_p; + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); if (p->o_arg.seqid == NULL) - goto err_free; + goto err_free_label; nfs_sb_active(dentry->d_sb); p->dentry = dget(dentry); p->dir = parent; @@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.id.uniquifier = sp->so_seqid.owner_id; p->o_arg.name = &dentry->d_name; p->o_arg.server = server; - p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; + p->o_arg.label = label; p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: @@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, nfs4_init_opendata_res(p); kref_init(&p->kref); return p; -err_free: + +err_free_label: + nfs4_label_free(p->f_label); +err_free_p: kfree(p); err: dput(parent); @@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref) if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + + nfs4_label_free(p->f_label); + dput(p->dir); dput(p->dentry); nfs_sb_deactive(sb); @@ -1078,7 +1149,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); + int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; @@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (ret) goto err; + nfs_setsecurity(inode, &data->f_attr, data->f_label); + if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, @@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) ret = -EAGAIN; if (!(data->f_attr.valid & NFS_ATTR_FATTR)) goto err; - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); ret = PTR_ERR(inode); if (IS_ERR(inode)) goto err; @@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context struct nfs4_opendata *opendata; opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, - NULL, claim, GFP_NOFS); + NULL, NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr); + _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); return 0; } @@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid *stateid = &state->stateid; - int status; + struct nfs_delegation *delegation; + struct rpc_cred *cred = NULL; + int status = -NFS4ERR_BAD_STATEID; /* If a state reset has been done, test_stateid is unneeded */ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) return; - status = nfs41_test_stateid(server, stateid); + /* Get the delegation credential for use by test/free_stateid */ + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(state->inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match(&delegation->stateid, stateid)) { + cred = get_rpccred(delegation->cred); + rcu_read_unlock(); + status = nfs41_test_stateid(server, stateid, cred); + } else + rcu_read_unlock(); + if (status != NFS_OK) { /* Free the stateid unless the server explicitly * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid); + nfs41_free_stateid(server, stateid, cred); nfs_remove_bad_delegation(state->inode); write_seqlock(&state->seqlock); @@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state) write_sequnlock(&state->seqlock); clear_bit(NFS_DELEGATED_STATE, &state->flags); } + + if (cred != NULL) + put_rpccred(cred); } /** @@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid *stateid = &state->open_stateid; + struct rpc_cred *cred = state->owner->so_cred; int status; /* If a state reset has been done, test_stateid is unneeded */ @@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) return -NFS4ERR_BAD_STATEID; - status = nfs41_test_stateid(server, stateid); + status = nfs41_test_stateid(server, stateid, cred); if (status != NFS_OK) { /* Free the stateid unless the server explicitly * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid); + nfs41_free_stateid(server, stateid, cred); clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); @@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, fmode_t fmode, int flags, - struct nfs4_state **res) + struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; + struct dentry *dentry; struct nfs4_state *state; unsigned int seq; int ret; @@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + dentry = opendata->dentry; + if (dentry->d_inode == NULL) { + /* FIXME: Is this d_drop() ever needed? */ + d_drop(dentry); + dentry = d_add_unique(dentry, igrab(state->inode)); + if (dentry == NULL) { + dentry = opendata->dentry; + } else if (dentry != ctx->dentry) { + dput(ctx->dentry); + ctx->dentry = dget(dentry); + } + nfs_set_verifier(dentry, + nfs_save_change_attribute(opendata->dir->d_inode)); + } + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); if (ret != 0) goto out; - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - nfs4_schedule_stateid_recovery(server, state); - *res = state; + ctx->state = state; + if (dentry->d_inode == state->inode) { + nfs_inode_attach_open_context(ctx); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + nfs4_schedule_stateid_recovery(server, state); + } out: return ret; } @@ -1978,19 +2086,21 @@ out: * Returns a referenced nfs4_state */ static int _nfs4_do_open(struct inode *dir, - struct dentry *dentry, - fmode_t fmode, + struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct rpc_cred *cred, - struct nfs4_state **res, - struct nfs4_threshold **ctx_th) + struct nfs4_label *label) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *opendata; + struct dentry *dentry = ctx->dentry; + struct rpc_cred *cred = ctx->cred; + struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; + fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct nfs4_label *olabel = NULL; int status; /* Protect against reboot recovery conflicts */ @@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode) claim = NFS4_OPEN_CLAIM_FH; opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, - claim, GFP_KERNEL); + label, claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; + if (label) { + olabel = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(olabel)) { + status = PTR_ERR(olabel); + goto err_opendata_put; + } + } + if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); if (!opendata->f_attr.mdsthreshold) - goto err_opendata_put; + goto err_free_label; opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); - status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); + status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); if (status != 0) - goto err_opendata_put; + goto err_free_label; + state = ctx->state; if ((opendata->o_arg.open_flags & O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { @@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir, nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, - state); - if (status == 0) + state, label, olabel); + if (status == 0) { nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } } if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) @@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir, kfree(opendata->f_attr.mdsthreshold); opendata->f_attr.mdsthreshold = NULL; + nfs4_label_free(olabel); + nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); - *res = state; return 0; +err_free_label: + nfs4_label_free(olabel); err_opendata_put: kfree(opendata->f_attr.mdsthreshold); nfs4_opendata_put(opendata); err_put_state_owner: nfs4_put_state_owner(sp); out_err: - *res = NULL; return status; } static struct nfs4_state *nfs4_do_open(struct inode *dir, - struct dentry *dentry, - fmode_t fmode, + struct nfs_open_context *ctx, int flags, struct iattr *sattr, - struct rpc_cred *cred, - struct nfs4_threshold **ctx_th) + struct nfs4_label *label) { struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; int status; - fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC; do { - status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, - &res, ctx_th); + status = _nfs4_do_open(dir, ctx, flags, sattr, label); + res = ctx->state; if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state) + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_setattrargs arg = { @@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .iap = sattr, .server = server, .bitmask = server->attr_bitmask, + .label = ilabel, }; struct nfs_setattrres res = { .fattr = fattr, + .label = olabel, .server = server, }; struct rpc_message msg = { @@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, bool truncate; int status; + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + nfs_fattr_init(fattr); /* Servers should only apply open mode checks for file size changes */ @@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state) + struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { @@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; int err; do { - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); + err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -2426,14 +2554,18 @@ static struct inode * nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr) { struct nfs4_state *state; + struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + + label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); /* Protect against concurrent sillydeletes */ - state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, - ctx->cred, &ctx->mdsthreshold); + state = nfs4_do_open(dir, ctx, open_flags, attr, label); + + nfs4_label_release_security(label); + if (IS_ERR(state)) return ERR_CAST(state); - ctx->state = state; - return igrab(state->inode); + return state->inode; } static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) @@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_CTIME; if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) server->caps |= NFS_CAP_MTIME; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) + server->caps |= NFS_CAP_SECURITY_LABEL; +#endif + memcpy(server->attr_bitmask_nl, res.attr_bitmask, + sizeof(server->attr_bitmask)); + if (server->caps & NFS_CAP_SECURITY_LABEL) { + server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + } memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; @@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + u32 bitmask[3]; struct nfs4_lookup_root_arg args = { - .bitmask = nfs4_fattr_bitmap, + .bitmask = bitmask, }; struct nfs4_lookup_res res = { .server = server, @@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; + bitmask[0] = nfs4_fattr_bitmap[0]; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* + * Process the label in the upcoming getfattr + */ + bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; + nfs_fattr_init(info->fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } @@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, { int error; struct nfs_fattr *fattr = info->fattr; + struct nfs4_label *label = NULL; error = nfs4_server_capabilities(server, mntfh); if (error < 0) { @@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, return error; } - error = nfs4_proc_getattr(server, mntfh, fattr); + label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + error = nfs4_proc_getattr(server, mntfh, fattr, label); if (error < 0) { dprintk("nfs4_get_root: getattr error = %d\n", -error); - return error; + goto err_free_label; } if (fattr->valid & NFS_ATTR_FATTR_FSID && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); +err_free_label: + nfs4_label_free(label); + return error; } @@ -2711,7 +2869,8 @@ out: return status; } -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_getattr_arg args = { .fh = fhandle, @@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; struct nfs4_getattr_res res = { .fattr = fattr, + .label = label, .server = server, }; struct rpc_message msg = { @@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - + + args.bitmask = nfs4_bitmask(server, label); + nfs_fattr_init(fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_proc_getattr(server, fhandle, fattr), + _nfs4_proc_getattr(server, fhandle, fattr, label), &exception); } while (exception.retry); return err; @@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct inode *inode = dentry->d_inode; struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; + struct nfs4_label *label = NULL; int status; if (pnfs_ld_layoutret_on_setattr(inode)) @@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } } - status = nfs4_do_setattr(inode, cred, fattr, sattr, state); - if (status == 0) + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(label)) + return PTR_ERR(label); + + status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + if (status == 0) { nfs_setattr_update_inode(inode, sattr); + nfs_setsecurity(inode, fattr, label); + } + nfs4_label_free(label); return status; } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs_server *server = NFS_SERVER(dir); int status; @@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct nfs4_lookup_res res = { .server = server, .fattr = fattr, + .label = label, .fh = fhandle, }; struct rpc_message msg = { @@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, .rpc_resp = &res, }; + args.bitmask = nfs4_bitmask(server, label); + nfs_fattr_init(fattr); dprintk("NFS call lookup %s\n", name->name); @@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs4_exception exception = { }; struct rpc_clnt *client = *clnt; int err; do { - err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr); + err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); switch (err) { case -NFS4ERR_BADNAME: err = -ENOENT; @@ -2879,12 +3053,13 @@ out: } static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -2899,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name, int status; struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir)); - status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); if (status < 0) { rpc_shutdown_client(client); return ERR_PTR(status); @@ -2924,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_cred = entry->cred, }; int mode = entry->mask; - int status; + int status = 0; /* * Determine which access bits we want to ask for... @@ -3029,6 +3204,7 @@ static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; @@ -3037,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (IS_ERR(ctx)) return PTR_ERR(ctx); + ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); + sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, dentry, ctx->mode, - flags, sattr, ctx->cred, - &ctx->mdsthreshold); - d_drop(dentry); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; } - d_add(dentry, igrab(state->inode)); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - ctx->state = state; out: + nfs4_label_release_security(ilabel); put_nfs_open_context(ctx); return status; } @@ -3098,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); + + nfs_fattr_init(res->dir_attr); } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -3173,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, .rpc_resp = &res, }; int status = -ENOMEM; - + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); @@ -3207,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * }; struct nfs4_link_res res = { .server = server, + .label = NULL, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], @@ -3219,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * if (res.fattr == NULL) goto out; + res.label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(res.label)) { + status = PTR_ERR(res.label); + goto out; + } + arg.bitmask = nfs4_bitmask(server, res.label); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(inode, res.fattr); + status = nfs_post_op_update_inode(inode, res.fattr); + if (!status) + nfs_setsecurity(inode, res.fattr, res.label); } + + + nfs4_label_free(res.label); + out: nfs_free_fattr(res.fattr); return status; @@ -3247,6 +3436,7 @@ struct nfs4_createdata { struct nfs4_create_res res; struct nfs_fh fh; struct nfs_fattr fattr; + struct nfs4_label *label; }; static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, @@ -3258,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, if (data != NULL) { struct nfs_server *server = NFS_SERVER(dir); + data->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(data->label)) + goto out_free; + data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; data->msg.rpc_argp = &data->arg; data->msg.rpc_resp = &data->res; @@ -3266,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->arg.name = name; data->arg.attrs = sattr; data->arg.ftype = ftype; - data->arg.bitmask = server->attr_bitmask; + data->arg.bitmask = nfs4_bitmask(server, data->label); data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; + data->res.label = data->label; nfs_fattr_init(data->res.fattr); } return data; +out_free: + kfree(data); + return NULL; } static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) @@ -3281,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } return status; } static void nfs4_free_createdata(struct nfs4_createdata *data) { + nfs4_label_free(data->label); kfree(data); } static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) + struct page *page, unsigned int len, struct iattr *sattr, + struct nfs4_label *label) { struct nfs4_createdata *data; int status = -ENAMETOOLONG; @@ -3308,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK]; data->arg.u.symlink.pages = &page; data->arg.u.symlink.len = len; + data->arg.label = label; status = nfs4_do_create(dir, dentry, data); @@ -3320,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + + label = nfs4_label_init_security(dir, dentry, sattr, &l); + do { err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_symlink(dir, dentry, page, - len, sattr), + len, sattr, label), &exception); } while (exception.retry); + + nfs4_label_release_security(label); return err; } static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) + struct iattr *sattr, struct nfs4_label *label) { struct nfs4_createdata *data; int status = -ENOMEM; @@ -3340,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (data == NULL) goto out; + data->arg.label = label; status = nfs4_do_create(dir, dentry, data); nfs4_free_createdata(data); @@ -3351,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + label = nfs4_label_init_security(dir, dentry, sattr, &l); + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mkdir(dir, dentry, sattr), + _nfs4_proc_mkdir(dir, dentry, sattr, label), &exception); } while (exception.retry); + nfs4_label_release_security(label); + return err; } @@ -3416,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, } static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, dev_t rdev) + struct iattr *sattr, struct nfs4_label *label, dev_t rdev) { struct nfs4_createdata *data; int mode = sattr->ia_mode; @@ -3441,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, status = -EINVAL; goto out_free; } - + + data->arg.label = label; status = nfs4_do_create(dir, dentry, data); out_free: nfs4_free_createdata(data); @@ -3453,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs4_exception exception = { }; + struct nfs4_label l, *label = NULL; int err; + label = nfs4_label_init_security(dir, dentry, sattr, &l); + sattr->ia_mode &= ~current_umask(); do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_mknod(dir, dentry, sattr, rdev), + _nfs4_proc_mknod(dir, dentry, sattr, label, rdev), &exception); } while (exception.retry); + + nfs4_label_release_security(label); + return err; } @@ -4187,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen return err; } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static int _nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; + struct nfs4_label label = {0, 0, buflen, buf}; + + u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs4_getattr_arg args = { + .fh = NFS_FH(inode), + .bitmask = bitmask, + }; + struct nfs4_getattr_res res = { + .fattr = &fattr, + .label = &label, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int ret; + + nfs_fattr_init(&fattr); + + ret = rpc_call_sync(server->client, &msg, 0); + if (ret) + return ret; + if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) + return -ENOENT; + if (buflen < label.len) + return -ERANGE; + return 0; +} + +static int nfs4_get_security_label(struct inode *inode, void *buf, + size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_get_security_label(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) +{ + + struct iattr sattr = {0}; + struct nfs_server *server = NFS_SERVER(inode); + const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs_setattrargs args = { + .fh = NFS_FH(inode), + .iap = &sattr, + .server = server, + .bitmask = bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + nfs4_stateid_copy(&args.stateid, &zero_stateid); + + status = rpc_call_sync(server->client, &msg, 0); + if (status) + dprintk("%s failed: %d\n", __func__, status); + + return status; +} + +static int nfs4_do_set_security_label(struct inode *inode, + struct nfs4_label *ilabel, + struct nfs_fattr *fattr, + struct nfs4_label *olabel) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + _nfs4_do_set_security_label(inode, ilabel, + fattr, olabel), + &exception); + } while (exception.retry); + return err; +} + +static int +nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +{ + struct nfs4_label ilabel, *olabel = NULL; + struct nfs_fattr fattr; + struct rpc_cred *cred; + struct inode *inode = dentry->d_inode; + int status; + + if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) + return -EOPNOTSUPP; + + nfs_fattr_init(&fattr); + + ilabel.pi = 0; + ilabel.lfs = 0; + ilabel.label = (char *)buf; + ilabel.len = buflen; + + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + + olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (IS_ERR(olabel)) { + status = -PTR_ERR(olabel); + goto out; + } + + status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel); + if (status == 0) + nfs_setsecurity(inode, &fattr, olabel); + + nfs4_label_free(olabel); +out: + put_rpccred(cred); + return status; +} +#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ + + static int nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) { @@ -4345,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* cb_client4 */ rcu_read_lock(); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, - sizeof(setclientid.sc_netid), + sizeof(setclientid.sc_netid), "%s", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_NETID)); rcu_read_unlock(); @@ -4528,7 +4897,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 static unsigned long nfs4_set_lock_task_retry(unsigned long timeout) { - freezable_schedule_timeout_killable(timeout); + freezable_schedule_timeout_killable_unsafe(timeout); timeout <<= 1; if (timeout > NFS4_LOCK_MAXTIMEOUT) return NFS4_LOCK_MAXTIMEOUT; @@ -5056,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) list_for_each_entry(lsp, &state->lock_states, ls_locks) { if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - status = nfs41_test_stateid(server, &lsp->ls_stateid); + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_stateid(server, + &lsp->ls_stateid, + cred); if (status != NFS_OK) { /* Free the stateid unless the server * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) nfs41_free_stateid(server, - &lsp->ls_stateid); + &lsp->ls_stateid, + cred); clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); ret = status; } @@ -5295,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, return len; } +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +static inline int nfs4_server_supports_labels(struct nfs_server *server) +{ + return server->caps & NFS_CAP_SECURITY_LABEL; +} + +static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key, + const void *buf, size_t buflen, + int flags, int type) +{ + if (security_ismaclabel(key)) + return nfs4_set_security_label(dentry, buf, buflen); + + return -EOPNOTSUPP; +} + +static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, + void *buf, size_t buflen, int type) +{ + if (security_ismaclabel(key)) + return nfs4_get_security_label(dentry->d_inode, buf, buflen); + return -EOPNOTSUPP; +} + +static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, + size_t list_len, const char *name, + size_t name_len, int type) +{ + size_t len = 0; + + if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(dentry->d_inode, NULL, 0); + if (list && len <= list_len) + security_inode_listsecurity(dentry->d_inode, list, len); + } + return len; +} + +static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = nfs4_xattr_list_nfs4_label, + .get = nfs4_xattr_get_nfs4_label, + .set = nfs4_xattr_set_nfs4_label, +}; +#endif + + /* * nfs_fhget will use either the mounted_on_fileid or the fileid */ @@ -5318,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[2] = { + u32 bitmask[3] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { @@ -5505,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) struct nfs41_exchange_id_args args = { .verifier = &verifier, .client = clp, - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, + .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, }; struct nfs41_exchange_id_res res = { 0 @@ -5762,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) */ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) { - struct nfs4_session *session = args->client->cl_session; - unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, - mxresp_sz = session->fc_target_max_resp_sz; + unsigned int max_rqst_sz, max_resp_sz; + + max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; + max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; - if (mxrqst_sz == 0) - mxrqst_sz = NFS_MAX_FILE_IO_SIZE; - if (mxresp_sz == 0) - mxresp_sz = NFS_MAX_FILE_IO_SIZE; /* Fore channel attributes */ - args->fc_attrs.max_rqst_sz = mxrqst_sz; - args->fc_attrs.max_resp_sz = mxresp_sz; + args->fc_attrs.max_rqst_sz = max_rqst_sz; + args->fc_attrs.max_resp_sz = max_resp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; args->fc_attrs.max_reqs = max_session_slots; @@ -6159,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { /* * Issue a global reclaim complete. */ -static int nfs41_proc_reclaim_complete(struct nfs_client *clp) +static int nfs41_proc_reclaim_complete(struct nfs_client *clp, + struct rpc_cred *cred) { struct nfs4_reclaim_complete_data *calldata; struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], + .rpc_cred = cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, @@ -6348,6 +6769,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], .rpc_argp = &lgp->args, .rpc_resp = &lgp->res, + .rpc_cred = lgp->cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = server->client, @@ -6451,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], .rpc_argp = &lrp->args, .rpc_resp = &lrp->res, + .rpc_cred = lrp->cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = lrp->clp->cl_rpcclient, @@ -6520,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server, EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); static int -_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +_nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, @@ -6532,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], .rpc_argp = &args, .rpc_resp = &res, + .rpc_cred = cred, }; int status; @@ -6542,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) return status; } -int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) +int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *pdev, + struct rpc_cred *cred) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(server, - _nfs4_proc_getdeviceinfo(server, pdev), + _nfs4_proc_getdeviceinfo(server, pdev, cred), &exception); } while (exception.retry); return err; @@ -6733,7 +7161,9 @@ out: return err; } -static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) +static int _nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) { int status; struct nfs41_test_stateid_args args = { @@ -6744,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], .rpc_argp = &args, .rpc_resp = &res, + .rpc_cred = cred, }; dprintk("NFS call test_stateid %p\n", stateid); @@ -6764,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) * * @server: server / transport on which to perform the operation * @stateid: state ID to test + * @cred: credential * * Returns NFS_OK if the server recognizes that "stateid" is valid. * Otherwise a negative NFS4ERR value is returned if the operation * failed or the state ID is not currently valid. */ -static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) +static int nfs41_test_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) { struct nfs4_exception exception = { }; int err; do { - err = _nfs41_test_stateid(server, stateid); + err = _nfs41_test_stateid(server, stateid, cred); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -6823,10 +7257,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = { static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, + struct rpc_cred *cred, bool privileged) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], + .rpc_cred = cred, }; struct rpc_task_setup task_setup = { .rpc_client = server->client, @@ -6859,16 +7295,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, * * @server: server / transport on which to perform the operation * @stateid: state ID to release + * @cred: credential * * Returns NFS_OK if the server freed "stateid". Otherwise a * negative NFS4ERR value is returned. */ -static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) +static int nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) { struct rpc_task *task; int ret; - task = _nfs41_free_stateid(server, stateid, true); + task = _nfs41_free_stateid(server, stateid, cred, true); if (IS_ERR(task)) return PTR_ERR(task); ret = rpc_wait_for_completion_task(task); @@ -6881,8 +7320,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { struct rpc_task *task; + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); + task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); if (IS_ERR(task)) return PTR_ERR(task); @@ -7004,11 +7444,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { }; #endif +#if defined(CONFIG_NFS_V4_2) +static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { + .minor_version = 2, + .init_caps = NFS_CAP_READDIRPLUS + | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_CHANGE_ATTR + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, + .call_sync = nfs4_call_sync_sequence, + .match_stateid = nfs41_match_stateid, + .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, +}; +#endif + const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { [0] = &nfs_v4_0_minor_ops, #if defined(CONFIG_NFS_V4_1) [1] = &nfs_v4_1_minor_ops, #endif +#if defined(CONFIG_NFS_V4_2) + [2] = &nfs_v4_2_minor_ops, +#endif }; const struct inode_operations nfs4_dir_inode_operations = { @@ -7108,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + &nfs4_xattr_nfs4_label_handler, +#endif NULL }; diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index ebda5f4..36e21cb 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) tbl->highest_used_slotid = new_max; else { tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(tbl->session, tbl); + nfs4_slot_tbl_drain_complete(tbl); } } dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, @@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) struct nfs4_slot *slot = pslot; struct nfs4_slot_table *tbl = slot->table; - if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) return false; slot->generation = tbl->generation; args->sa_slot = slot; @@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp) return 0; } -int nfs4_init_session(struct nfs_server *server) +int nfs4_init_session(struct nfs_client *clp) { - struct nfs_client *clp = server->nfs_client; - struct nfs4_session *session; - unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE; - unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE; - if (!nfs4_has_session(clp)) return 0; - if (server->rsize != 0) - target_max_resp_sz = server->rsize; - target_max_resp_sz += nfs41_maxread_overhead; - - if (server->wsize != 0) - target_max_rqst_sz = server->wsize; - target_max_rqst_sz += nfs41_maxwrite_overhead; - - session = clp->cl_session; - spin_lock(&clp->cl_lock); - if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - /* Initialise targets and channel attributes */ - session->fc_target_max_rqst_sz = target_max_rqst_sz; - session->fc_attrs.max_rqst_sz = target_max_rqst_sz; - session->fc_target_max_resp_sz = target_max_resp_sz; - session->fc_attrs.max_resp_sz = target_max_resp_sz; - } else { - /* Just adjust the targets */ - if (target_max_rqst_sz > session->fc_target_max_rqst_sz) { - session->fc_target_max_rqst_sz = target_max_rqst_sz; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - } - if (target_max_resp_sz > session->fc_target_max_resp_sz) { - session->fc_target_max_resp_sz = target_max_resp_sz; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - } - } - spin_unlock(&clp->cl_lock); - - if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) - nfs4_schedule_lease_recovery(clp); - + clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state); return nfs41_check_session_ready(clp); } diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 6f3cb39..3a153d8 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -25,6 +25,10 @@ struct nfs4_slot { }; /* Sessions */ +enum nfs4_slot_tbl_state { + NFS4_SLOT_TBL_DRAINING, +}; + #define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ @@ -43,6 +47,7 @@ struct nfs4_slot_table { unsigned long generation; /* Generation counter for target_highest_slotid */ struct completion complete; + unsigned long slot_tbl_state; }; /* @@ -61,14 +66,10 @@ struct nfs4_session { struct nfs4_channel_attrs bc_attrs; struct nfs4_slot_table bc_slot_table; struct nfs_client *clp; - /* Create session arguments */ - unsigned int fc_target_max_rqst_sz; - unsigned int fc_target_max_resp_sz; }; enum nfs4_session_state { NFS4_SESSION_INITING, - NFS4_SESSION_DRAINING, }; #if defined(CONFIG_NFS_V4_1) @@ -85,15 +86,14 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern void nfs4_destroy_session(struct nfs4_session *session); -extern int nfs4_init_session(struct nfs_server *server); +extern int nfs4_init_session(struct nfs_client *clp); extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); -extern void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); -static inline bool nfs4_session_draining(struct nfs4_session *session) +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) { - return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); } bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, @@ -119,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) #else /* defined(CONFIG_NFS_V4_1) */ -static inline int nfs4_init_session(struct nfs_server *server) +static inline int nfs4_init_session(struct nfs_client *clp) { return 0; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 300d17d..e22862f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -228,38 +228,37 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) return status; } -/* - * Back channel returns NFS4ERR_DELAY for new requests when - * NFS4_SESSION_DRAINING is set so there is no work to be done when draining - * is ended. - */ -static void nfs4_end_drain_session(struct nfs_client *clp) +static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl) { - struct nfs4_session *ses = clp->cl_session; - struct nfs4_slot_table *tbl; - - if (ses == NULL) - return; - tbl = &ses->fc_slot_table; - if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_lock(&tbl->slot_tbl_lock); nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); } } +static void nfs4_end_drain_session(struct nfs_client *clp) +{ + struct nfs4_session *ses = clp->cl_session; + + if (ses != NULL) { + nfs4_end_drain_slot_table(&ses->bc_slot_table); + nfs4_end_drain_slot_table(&ses->fc_slot_table); + } +} + /* * Signal state manager thread if session fore channel is drained */ -void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl) +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) { - if (nfs4_session_draining(session)) + if (nfs4_slot_tbl_draining(tbl)) complete(&tbl->complete); } -static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) +static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { + set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); spin_lock(&tbl->slot_tbl_lock); if (tbl->highest_used_slotid != NFS4_NO_SLOT) { INIT_COMPLETION(tbl->complete); @@ -275,13 +274,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int ret = 0; - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); /* back channel */ - ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); if (ret) return ret; /* fore channel */ - return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); + return nfs4_drain_slot_tbl(&ses->fc_slot_table); } static void nfs41_finish_session_reset(struct nfs_client *clp) @@ -1195,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) snprintf(buf, sizeof(buf), "%s-manager", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); rcu_read_unlock(); - task = kthread_run(nfs4_run_state_manager, clp, buf); + task = kthread_run(nfs4_run_state_manager, clp, "%s", buf); if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); @@ -1374,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); /* Protect inode->i_flock using the BKL */ - lock_flocks(); + spin_lock(&inode->i_lock); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; if (nfs_file_open_context(fl->fl_file)->state != state) continue; - unlock_flocks(); + spin_unlock(&inode->i_lock); status = ops->recover_lock(state, fl); switch (status) { case 0: @@ -1407,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* kill_proc(fl->fl_pid, SIGLOST, 1); */ status = 0; } - lock_flocks(); + spin_lock(&inode->i_lock); } - unlock_flocks(); + spin_unlock(&inode->i_lock); out: up_write(&nfsi->rwsem); return status; @@ -1564,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) } static void nfs4_reclaim_complete(struct nfs_client *clp, - const struct nfs4_state_recovery_ops *ops) + const struct nfs4_state_recovery_ops *ops, + struct rpc_cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) - (void)ops->reclaim_complete(clp); + (void)ops->reclaim_complete(clp, cred); } static void nfs4_clear_reclaim_server(struct nfs_server *server) @@ -1613,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { + const struct nfs4_state_recovery_ops *ops; + struct rpc_cred *cred; + if (!nfs4_state_clear_reclaim_reboot(clp)) return; - nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); + ops = clp->cl_mvops->reboot_recovery_ops; + cred = ops->get_clid_cred(clp); + nfs4_reclaim_complete(clp, ops, cred); + put_rpccred(cred); } static void nfs_delegation_clear_all(struct nfs_client *clp) diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index a5e1a30..5dbe2d2 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -9,6 +9,7 @@ #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" @@ -331,18 +332,24 @@ static int __init init_nfs_v4(void) { int err; - err = nfs_idmap_init(); + err = nfs_dns_resolver_init(); if (err) goto out; - err = nfs4_register_sysctl(); + err = nfs_idmap_init(); if (err) goto out1; + err = nfs4_register_sysctl(); + if (err) + goto out2; + register_nfs_version(&nfs_v4); return 0; -out1: +out2: nfs_idmap_quit(); +out1: + nfs_dns_resolver_destroy(); out: return err; } @@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void) unregister_nfs_version(&nfs_v4); nfs4_unregister_sysctl(); nfs_idmap_quit(); + nfs_dns_resolver_destroy(); } MODULE_LICENSE("GPL"); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4be8d13..0abfb846 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int); #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#ifdef CONFIG_NFS_V4_SECURITY_LABEL +/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */ +#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN)) +#define encode_readdir_space 24 +#define encode_readdir_bitmask_sz 3 +#else +#define nfs4_label_maxsz 0 +#define encode_readdir_space 20 +#define encode_readdir_bitmask_sz 2 +#endif /* We support only one layout type per file system */ #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ 3 + 3 + 3 + nfs4_owner_maxsz + \ - nfs4_group_maxsz + decode_mdsthreshold_maxsz)) + nfs4_group_maxsz + nfs4_label_maxsz + \ + decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ nfs4_fattr_value_maxsz) #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) @@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int); 1 + 2 + 1 + \ nfs4_owner_maxsz + \ nfs4_group_maxsz + \ + nfs4_label_maxsz + \ 4 + 4) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) @@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int); encode_stateid_maxsz + 3) #define decode_read_maxsz (op_decode_hdr_maxsz + 2) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ - 2 + encode_verifier_maxsz + 5) + 2 + encode_verifier_maxsz + 5 + \ + nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz) + decode_verifier_maxsz + \ + nfs4_label_maxsz + nfs4_fattr_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ @@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH + decode_sequence_maxsz + decode_putfh_maxsz) * XDR_UNIT); + +const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz) * + XDR_UNIT); +EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead); #endif /* CONFIG_NFS_V4_1 */ static const umode_t nfs_type2fmt[] = { @@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } -static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, + const struct nfs4_label *label, + const struct nfs_server *server) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -979,15 +1001,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const int len; uint32_t bmval0 = 0; uint32_t bmval1 = 0; + uint32_t bmval2 = 0; /* * We reserve enough space to write the entire attribute buffer at once. * In the worst-case, this would be - * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) - * = 36 bytes, plus any contribution from variable-length fields + * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 40 bytes, plus any contribution from variable-length fields * such as owner/group. */ - len = 16; + len = 20; /* Sigh */ if (iap->ia_valid & ATTR_SIZE) @@ -1017,6 +1040,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const } len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } + if (label) + len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); if (iap->ia_valid & ATTR_ATIME_SET) len += 16; else if (iap->ia_valid & ATTR_ATIME) @@ -1031,9 +1056,9 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(3); q = p; - p += 3; + p += 4; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; @@ -1071,6 +1096,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } + if (label) { + bmval2 |= FATTR4_WORD2_SECURITY_LABEL; + *p++ = cpu_to_be32(label->lfs); + *p++ = cpu_to_be32(label->pi); + *p++ = cpu_to_be32(label->len); + p = xdr_encode_opaque_fixed(p, label->label, label->len); + } /* * Now we backfill the bitmap and the attribute buffer length. @@ -1080,9 +1112,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len, ((char *)p - (char *)q) + 4); BUG(); } - len = (char *)p - (char *)q - 12; + len = (char *)p - (char *)q - 16; *q++ = htonl(bmval0); *q++ = htonl(bmval1); + *q++ = htonl(bmval2); *q = htonl(len); /* out: */ @@ -1136,7 +1169,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->server); + encode_attrs(xdr, create->attrs, create->label, create->server); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1188,8 +1221,10 @@ encode_getattr_three(struct xdr_stream *xdr, static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], hdr); + encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], + bitmask[2] & nfs4_fattr_bitmap[2], + hdr); } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, @@ -1367,11 +1402,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1381,7 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->server); + encode_attrs(xdr, &dummy, arg->label, arg->server); } } @@ -1532,7 +1567,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { - uint32_t attrs[2] = { + uint32_t attrs[3] = { FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; @@ -1555,20 +1590,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); encode_uint64(xdr, readdir->cookie); encode_nfs4_verifier(xdr, &readdir->verifier); - p = reserve_space(xdr, 20); + p = reserve_space(xdr, encode_readdir_space); *p++ = cpu_to_be32(dircount); *p++ = cpu_to_be32(readdir->count); - *p++ = cpu_to_be32(2); - + *p++ = cpu_to_be32(encode_readdir_bitmask_sz); *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); - *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); + if (encode_readdir_bitmask_sz > 2) { + if (hdr->minorversion > 1) + attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; + p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]); + } memcpy(verf, readdir->verifier.data, sizeof(verf)); - dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", + + dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n", __func__, (unsigned long long)readdir->cookie, verf[0], verf[1], attrs[0] & readdir->bitmask[0], - attrs[1] & readdir->bitmask[1]); + attrs[1] & readdir->bitmask[1], + attrs[2] & readdir->bitmask[2]); } static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) @@ -1627,7 +1668,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, server); + encode_attrs(xdr, arg->iap, arg->label, server); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -1889,7 +1930,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(args->pdev->layout_type); - *p++ = cpu_to_be32(args->pdev->pglen); /* gdia_maxcount */ + *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ *p++ = cpu_to_be32(0); /* bitmap length 0 */ } @@ -4038,6 +4079,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, return status; } +static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_label *label) +{ + uint32_t pi = 0; + uint32_t lfs = 0; + __u32 len; + __be32 *p; + int status = 0; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + lfs = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pi = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p++); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (len < NFS4_MAXLABELLEN) { + if (label) { + memcpy(label->label, p, len); + label->len = len; + label->pi = pi; + label->lfs = lfs; + status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; + } + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + } else + printk(KERN_WARNING "%s: label too long (%u)!\n", + __func__, len); + } + if (label && label->label) + dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, + (char *)label->label, label->len, label->pi, label->lfs); + return status; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) { int status = 0; @@ -4380,7 +4471,7 @@ out_overflow: static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, - struct nfs4_fs_locations *fs_loc, + struct nfs4_fs_locations *fs_loc, struct nfs4_label *label, const struct nfs_server *server) { int status; @@ -4488,6 +4579,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, if (status < 0) goto xdr_error; + if (label) { + status = decode_attr_security_label(xdr, bitmap, label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + } + xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; @@ -4495,7 +4593,7 @@ xdr_error: static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, - const struct nfs_server *server) + struct nfs4_label *label, const struct nfs_server *server) { unsigned int savep; uint32_t attrlen, @@ -4514,7 +4612,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, + label, server); if (status < 0) goto xdr_error; @@ -4524,10 +4623,16 @@ xdr_error: return status; } +static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct nfs4_label *label, const struct nfs_server *server) +{ + return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server); +} + static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, NULL, server); + return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server); } /* @@ -5919,7 +6024,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -5945,7 +6050,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, goto out; status = decode_getfh(xdr, res->fh); if (status == 0) - status = decode_getfattr(xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, + res->label, res->server); out: return status; } @@ -6036,7 +6142,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6065,7 +6171,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6097,7 +6203,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->fattr, res->server); + status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6230,7 +6336,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, goto out; if (res->access_request) decode_access(xdr, &res->access_supported, &res->access_result); - decode_getfattr(xdr, res->f_attr, res->server); + decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server); out: return status; } @@ -6307,7 +6413,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, status = decode_setattr(xdr); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server); + decode_getfattr_label(xdr, res->fattr, res->label, res->server); out: return status; } @@ -6696,7 +6802,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, NULL, res->fs_locations, - res->fs_locations->server); + NULL, res->fs_locations->server); out: return status; } @@ -7109,7 +7215,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, goto out_overflow; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - NULL, entry->server) < 0) + NULL, entry->label, entry->server) < 0) goto out_overflow; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index a9ebd81..e4f9cbf 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, pd.pgbase = 0; pd.pglen = PAGE_SIZE; pd.mincount = 0; + pd.maxcount = PAGE_SIZE; - err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd); + err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, + pnfslay->plh_lc_cred); dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); if (err) goto err_out; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c5bd758e..3a3a79d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static inline u64 +static u64 end_offset(u64 start, u64 len) { u64 end; @@ -376,9 +376,9 @@ end_offset(u64 start, u64 len) * start2 end2 * [----------------) */ -static inline int -lo_seg_contained(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +static bool +pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; u64 end1 = end_offset(start1, l1->length); @@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1, * start2 end2 * [----------------) */ -static inline int -lo_seg_intersecting(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +static bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; u64 end1 = end_offset(start1, l1->length); @@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1, } static bool -should_free_lseg(struct pnfs_layout_range *lseg_range, - struct pnfs_layout_range *recall_range) +should_free_lseg(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) { return (recall_range->iomode == IOMODE_ANY || lseg_range->iomode == recall_range->iomode) && - lo_seg_intersecting(lseg_range, recall_range); + pnfs_lseg_range_intersecting(lseg_range, recall_range); } static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, @@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; /* Synchronously retrieve layout information from server and * store in lseg. @@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino) lrp->args.inode = ino; lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; + lrp->cred = lo->plh_lc_cred; status = nfs4_proc_layoutreturn(lrp); out: @@ -984,8 +986,8 @@ out: * are seen first. */ static s64 -cmp_layout(struct pnfs_layout_range *l1, - struct pnfs_layout_range *l2) +pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { s64 d; @@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, dprintk("%s:Begin\n", __func__); list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0) + if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino, INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; - lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); + lo->plh_lc_cred = get_rpccred(ctx->cred); return lo; } @@ -1091,21 +1093,21 @@ out_existing: * READ READ true * READ RW true */ -static int -is_matching_lseg(struct pnfs_layout_range *ls_range, - struct pnfs_layout_range *range) +static bool +pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, + const struct pnfs_layout_range *range) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || - !lo_seg_intersecting(ls_range, range)) + !pnfs_lseg_range_intersecting(ls_range, range)) return 0; /* range1 covers only the first byte in the range */ range1 = *range; range1.length = 1; - return lo_seg_contained(ls_range, &range1); + return pnfs_lseg_range_contained(ls_range, &range1); } /* @@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && - is_matching_lseg(&lseg->pls_range, range)) { + pnfs_lseg_range_match(&lseg->pls_range, range)) { ret = pnfs_get_lseg(lseg); break; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f5f8a47..a4f4181 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -149,9 +149,10 @@ struct pnfs_device { struct nfs4_deviceid dev_id; unsigned int layout_type; unsigned int mincount; + unsigned int maxcount; /* gdia_maxcount */ struct page **pages; unsigned int pgbase; - unsigned int pglen; + unsigned int pglen; /* reply buffer length */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server, const struct nfs_fh *fh, struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, - struct pnfs_device *dev); + struct pnfs_device *dev, + struct rpc_cred *cred); extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index fc8de90..c041c41 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, struct nfs4_label *label) { struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], @@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, static int nfs_proc_lookup(struct inode *dir, struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), @@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); nfs_free_createdata(data); out: dprintk("NFS reply create: %d\n", status); @@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); nfs_free_createdata(data); out: dprintk("NFS reply mknod: %d\n", status); @@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, * should fill in the data with a LOOKUP call on the wire. */ if (status == 0) - status = nfs_instantiate(dentry, fh, fattr); + status = nfs_instantiate(dentry, fh, fattr, NULL); out_free: nfs_free_fattr(fattr); @@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a366107..71fdc0d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = { enum { Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, - Opt_vers_4_1, + Opt_vers_4_1, Opt_vers_4_2, Opt_vers_err }; @@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = { { Opt_vers_4, "4" }, { Opt_vers_4_0, "4.0" }, { Opt_vers_4_1, "4.1" }, + { Opt_vers_4_2, "4.2" }, { Opt_vers_err, NULL } }; @@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) seq_printf(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); show_sessions(m, nfss); show_pnfs(m, nfss); @@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string, mnt->version = 4; mnt->minorversion = 1; break; + case Opt_vers_4_2: + mnt->version = 4; + mnt->minorversion = 2; + break; default: return 0; } @@ -1608,29 +1614,13 @@ out_security_failure: } /* - * Select a security flavor for this mount. The selected flavor - * is planted in args->auth_flavors[0]. - * - * Returns 0 on success, -EACCES on failure. + * Ensure that the specified authtype in args->auth_flavors[0] is supported by + * the server. Returns 0 if it's ok, and -EACCES if not. */ -static int nfs_select_flavor(struct nfs_parsed_mount_data *args, - struct nfs_mount_request *request) +static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args, + rpc_authflavor_t *server_authlist, unsigned int count) { - unsigned int i, count = *(request->auth_flav_len); - rpc_authflavor_t flavor; - - /* - * The NFSv2 MNT operation does not return a flavor list. - */ - if (args->mount_server.version != NFS_MNT3_VERSION) - goto out_default; - - /* - * Certain releases of Linux's mountd return an empty - * flavor list in some cases. - */ - if (count == 0) - goto out_default; + unsigned int i; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args, * means that the server will ignore the rpc creds, so any flavor * can be used. */ - if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { - for (i = 0; i < count; i++) { - if (args->auth_flavors[0] == request->auth_flavs[i] || - request->auth_flavs[i] == RPC_AUTH_NULL) - goto out; - } - dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n", - args->auth_flavors[0]); - goto out_err; - } - - /* - * RFC 2623, section 2.7 suggests we SHOULD prefer the - * flavor listed first. However, some servers list - * AUTH_NULL first. Avoid ever choosing AUTH_NULL. - */ for (i = 0; i < count; i++) { - struct rpcsec_gss_info info; - - flavor = request->auth_flavs[i]; - switch (flavor) { - case RPC_AUTH_UNIX: - goto out_set; - case RPC_AUTH_NULL: - continue; - default: - if (rpcauth_get_gssinfo(flavor, &info) == 0) - goto out_set; - } + if (args->auth_flavors[0] == server_authlist[i] || + server_authlist[i] == RPC_AUTH_NULL) + goto out; } - /* - * As a last chance, see if the server list contains AUTH_NULL - - * if it does, use the default flavor. - */ - for (i = 0; i < count; i++) { - if (request->auth_flavs[i] == RPC_AUTH_NULL) - goto out_default; - } - - dfprintk(MOUNT, "NFS: no auth flavors in common with server\n"); - goto out_err; + dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n", + args->auth_flavors[0]); + return -EACCES; -out_default: - /* use default if flavor not already set */ - flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ? - RPC_AUTH_UNIX : args->auth_flavors[0]; -out_set: - args->auth_flavors[0] = flavor; out: - dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); return 0; -out_err: - return -EACCES; } /* @@ -1701,10 +1650,10 @@ out_err: * corresponding to the provided path. */ static int nfs_request_mount(struct nfs_parsed_mount_data *args, - struct nfs_fh *root_fh) + struct nfs_fh *root_fh, + rpc_authflavor_t *server_authlist, + unsigned int *server_authlist_len) { - rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; - unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); struct nfs_mount_request request = { .sap = (struct sockaddr *) &args->mount_server.address, @@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, .protocol = args->mount_server.protocol, .fh = root_fh, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, - .auth_flav_len = &server_authlist_len, + .auth_flav_len = server_authlist_len, .auth_flavs = server_authlist, .net = args->net, }; @@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return status; } - return nfs_select_flavor(args, &request); + return 0; } -struct dentry *nfs_try_mount(int flags, const char *dev_name, - struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) { int status; - struct nfs_server *server; + unsigned int i; + bool tried_auth_unix = false; + bool auth_null_in_list = false; + struct nfs_server *server = ERR_PTR(-EACCES); + struct nfs_parsed_mount_data *args = mount_info->parsed; + rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; + unsigned int authlist_len = ARRAY_SIZE(authlist); + + status = nfs_request_mount(args, mount_info->mntfh, authlist, + &authlist_len); + if (status) + return ERR_PTR(status); - if (mount_info->parsed->need_mount) { - status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); + /* + * Was a sec= authflavor specified in the options? First, verify + * whether the server supports it, and then just try to use it if so. + */ + if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { + status = nfs_verify_authflavor(args, authlist, authlist_len); + dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]); if (status) return ERR_PTR(status); + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + } + + /* + * No sec= option was provided. RFC 2623, section 2.7 suggests we + * SHOULD prefer the flavor listed first. However, some servers list + * AUTH_NULL first. Avoid ever choosing AUTH_NULL. + */ + for (i = 0; i < authlist_len; ++i) { + rpc_authflavor_t flavor; + struct rpcsec_gss_info info; + + flavor = authlist[i]; + switch (flavor) { + case RPC_AUTH_UNIX: + tried_auth_unix = true; + break; + case RPC_AUTH_NULL: + auth_null_in_list = true; + continue; + default: + if (rpcauth_get_gssinfo(flavor, &info) != 0) + continue; + /* Fallthrough */ + } + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); + args->auth_flavors[0] = flavor; + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + if (!IS_ERR(server)) + return server; } - /* Get a volume representation */ - server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + /* + * Nothing we tried so far worked. At this point, give up if we've + * already tried AUTH_UNIX or if the server's list doesn't contain + * AUTH_NULL + */ + if (tried_auth_unix || !auth_null_in_list) + return server; + + /* Last chance! Try AUTH_UNIX */ + dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); + args->auth_flavors[0] = RPC_AUTH_UNIX; + return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); +} + +struct dentry *nfs_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info, + struct nfs_subversion *nfs_mod) +{ + struct nfs_server *server; + + if (mount_info->parsed->need_mount) + server = nfs_try_mount_request(mount_info, nfs_mod); + else + server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + if (IS_ERR(server)) return ERR_CAST(server); @@ -1942,6 +1959,7 @@ static int nfs23_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->flags & NFS_MOUNT_SECFLAVOUR) args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) @@ -2411,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server) int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { - return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); + int error; + unsigned long kflags = 0, kflags_out = 0; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + + error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts, + kflags, &kflags_out); + if (error) + goto err; + + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; +err: + return error; } EXPORT_SYMBOL_GPL(nfs_set_sb_security); @@ -2637,6 +2669,7 @@ static int nfs4_validate_mount_data(void *options, goto out_no_address; args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->auth_flavourlen) { if (data->auth_flavourlen > 1) goto out_inval_auth; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 1f1f38f..60395ad 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - dentry->d_count); + d_count(dentry)); nfs_inc_stats(dir, NFSIOS_SILLYRENAME); /* diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 4e9a21d..105a3b0 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -240,11 +240,16 @@ struct name_list { struct list_head list; }; +struct nfs4_dir_ctx { + struct dir_context ctx; + struct list_head names; +}; + static int nfsd4_build_namelist(void *arg, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct list_head *names = arg; + struct nfs4_dir_ctx *ctx = arg; struct name_list *entry; if (namlen != HEXDIR_LEN - 1) @@ -254,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen, return -ENOMEM; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; - list_add(&entry->list, names); + list_add(&entry->list, &ctx->names); return 0; } @@ -263,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) { const struct cred *original_cred; struct dentry *dir = nn->rec_file->f_path.dentry; - LIST_HEAD(names); + struct nfs4_dir_ctx ctx = { + .ctx.actor = nfsd4_build_namelist, + .names = LIST_HEAD_INIT(ctx.names) + }; int status; status = nfs4_save_creds(&original_cred); @@ -276,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) return status; } - status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); + status = iterate_dir(nn->rec_file, &ctx.ctx); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - while (!list_empty(&names)) { + while (!list_empty(&ctx.names)) { struct name_list *entry; - entry = list_entry(names.next, struct name_list, list); + entry = list_entry(ctx.names.next, struct name_list, list); if (!status) { struct dentry *dentry; dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 25ae6ce..280acef 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2723,13 +2723,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); - /* only place dl_time is set. protected by lock_flocks*/ + /* Only place dl_time is set; protected by i_lock: */ dp->dl_time = get_seconds(); nfsd4_cb_recall(dp); } -/* Called from break_lease() with lock_flocks() held. */ +/* Called from break_lease() with i_lock held. */ static void nfsd_break_deleg_cb(struct file_lock *fl) { struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; @@ -4599,7 +4599,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) struct inode *inode = filp->fi_inode; int status = 0; - lock_flocks(); + spin_lock(&inode->i_lock); for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { if ((*flpp)->fl_owner == (fl_owner_t)lowner) { status = 1; @@ -4607,7 +4607,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) } } out: - unlock_flocks(); + spin_unlock(&inode->i_lock); return status; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1e757fa..8ff6a00 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1940,6 +1940,7 @@ struct buffered_dirent { }; struct readdir_data { + struct dir_context ctx; char *dirent; size_t used; int full; @@ -1971,13 +1972,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, struct readdir_cd *cdp, loff_t *offsetp) { - struct readdir_data buf; struct buffered_dirent *de; int host_err; int size; loff_t offset; + struct readdir_data buf = { + .ctx.actor = nfsd_buffered_filldir, + .dirent = (void *)__get_free_page(GFP_KERNEL) + }; - buf.dirent = (void *)__get_free_page(GFP_KERNEL); if (!buf.dirent) return nfserrno(-ENOMEM); @@ -1991,7 +1994,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, buf.used = 0; buf.full = 0; - host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf); + host_err = iterate_dir(file, &buf.ctx); if (buf.full) host_err = 0; diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index eed4d7b..741fd02 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, } /** + * nilfs_palloc_count_desc_blocks - count descriptor blocks number + * @inode: inode of metadata file using this allocator + * @desc_blocks: descriptor blocks number [out] + */ +static int nilfs_palloc_count_desc_blocks(struct inode *inode, + unsigned long *desc_blocks) +{ + unsigned long blknum; + int ret; + + ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); + if (likely(!ret)) + *desc_blocks = DIV_ROUND_UP( + blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); + return ret; +} + +/** + * nilfs_palloc_mdt_file_can_grow - check potential opportunity for + * MDT file growing + * @inode: inode of metadata file using this allocator + * @desc_blocks: known current descriptor blocks count + */ +static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, + unsigned long desc_blocks) +{ + return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < + nilfs_palloc_groups_count(inode); +} + +/** + * nilfs_palloc_count_max_entries - count max number of entries that can be + * described by descriptor blocks count + * @inode: inode of metadata file using this allocator + * @nused: current number of used entries + * @nmaxp: max number of entries [out] + */ +int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) +{ + unsigned long desc_blocks = 0; + u64 entries_per_desc_block, nmax; + int err; + + err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); + if (unlikely(err)) + return err; + + entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * + nilfs_palloc_groups_per_desc_block(inode); + nmax = entries_per_desc_block * desc_blocks; + + if (nused == nmax && + nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) + nmax += entries_per_desc_block; + + if (nused > nmax) + return -ERANGE; + + *nmaxp = nmax; + return 0; +} + +/** * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index fb72381..4bd6451 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int, void *nilfs_palloc_block_get_entry(const struct inode *, __u64, const struct buffer_head *, void *); +int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); + /** * nilfs_palloc_req - persistent allocator request and reply * @pr_entry_nr: entry number (vblocknr or inode number) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index f30b017..197a63e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } -static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int nilfs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); /* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ - unsigned char *types = NULL; - int ret; if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) - goto success; - - types = nilfs_filetype_table; + return 0; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { nilfs_error(sb, __func__, "bad page in #%lu", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; - ret = -EIO; - goto done; + ctx->pos += PAGE_CACHE_SIZE - offset; + return -EIO; } kaddr = page_address(page); de = (struct nilfs_dir_entry *)(kaddr + offset); @@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (de->rec_len == 0) { nilfs_error(sb, __func__, "zero-length directory entry"); - ret = -EIO; nilfs_put_page(page); - goto done; + return -EIO; } if (de->inode) { - int over; - unsigned char d_type = DT_UNKNOWN; + unsigned char t; - if (types && de->file_type < NILFS_FT_MAX) - d_type = types[de->file_type]; + if (de->file_type < NILFS_FT_MAX) + t = nilfs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, - le64_to_cpu(de->inode), d_type); - if (over) { + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode), t)) { nilfs_put_page(page); - goto success; + return 0; } } - filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); + ctx->pos += nilfs_rec_len_from_disk(de->rec_len); } nilfs_put_page(page); } - -success: - ret = 0; -done: - return ret; + return 0; } /* @@ -678,7 +666,7 @@ not_empty: const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = nilfs_readdir, + .iterate = nilfs_readdir, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index d8e65bd..6548c78 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, } /** + * nilfs_ifile_count_free_inodes - calculate free inodes count + * @ifile: ifile inode + * @nmaxinodes: current maximum of available inodes count [out] + * @nfreeinodes: free inodes count [out] + */ +int nilfs_ifile_count_free_inodes(struct inode *ifile, + u64 *nmaxinodes, u64 *nfreeinodes) +{ + u64 nused; + int err; + + *nmaxinodes = 0; + *nfreeinodes = 0; + + nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count); + err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); + if (likely(!err)) + *nfreeinodes = *nmaxinodes - nused; + return err; +} + +/** * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 59b6f2b..679674d 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); int nilfs_ifile_delete_inode(struct inode *, ino_t); int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); +int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); + int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, size_t inode_size, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 689fb60..b1a5277 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n) inode_add_bytes(inode, (1 << inode->i_blkbits) * n); if (root) - atomic_add(n, &root->blocks_count); + atomic64_add(n, &root->blocks_count); } void nilfs_inode_sub_blocks(struct inode *inode, int n) @@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); if (root) - atomic_sub(n, &root->blocks_count); + atomic64_sub(n, &root->blocks_count); } /** @@ -219,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) static int nilfs_set_page_dirty(struct page *page) { - int ret = __set_page_dirty_buffers(page); + int ret = __set_page_dirty_nobuffers(page); - if (ret) { + if (page_has_buffers(page)) { struct inode *inode = page->mapping->host; - unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + unsigned nr_dirty = 0; + struct buffer_head *bh, *head; - nilfs_set_file_dirty(inode, nr_dirty); + /* + * This page is locked by callers, and no other thread + * concurrently marks its buffers dirty since they are + * only dirtied through routines in fs/buffer.c in + * which call sites of mark_buffer_dirty are protected + * by page lock. + */ + bh = head = page_buffers(page); + do { + /* Do not mark hole blocks dirty */ + if (buffer_dirty(bh) || !buffer_mapped(bh)) + continue; + + set_buffer_dirty(bh); + nr_dirty++; + } while (bh = bh->b_this_page, bh != head); + + if (nr_dirty) + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } @@ -350,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ - atomic_inc(&root->inodes_count); + atomic64_inc(&root->inodes_count); inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -782,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode) ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); if (!ret) - atomic_dec(&ii->i_root->inodes_count); + atomic64_dec(&ii->i_root->inodes_count); nilfs_clear_inode(inode); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a5752a58..bd88a74 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) raw_cp->cp_snapshot_list.ssl_next = 0; raw_cp->cp_snapshot_list.ssl_prev = 0; raw_cp->cp_inodes_count = - cpu_to_le64(atomic_read(&sci->sc_root->inodes_count)); + cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); raw_cp->cp_blocks_count = - cpu_to_le64(atomic_read(&sci->sc_root->blocks_count)); + cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); raw_cp->cp_nblk_inc = cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7d1f9f..af3ba04 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, if (err) goto failed_bh; - atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); - atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + atomic64_set(&root->inodes_count, + le64_to_cpu(raw_cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, + le64_to_cpu(raw_cp->cp_blocks_count)); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); @@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) unsigned long overhead; unsigned long nrsvblocks; sector_t nfreeblocks; + u64 nmaxinodes, nfreeinodes; int err; /* @@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (unlikely(err)) return err; + err = nilfs_ifile_count_free_inodes(root->ifile, + &nmaxinodes, &nfreeinodes); + if (unlikely(err)) { + printk(KERN_WARNING + "NILFS warning: fail to count free inodes: err %d.\n", + err); + if (err == -ERANGE) { + /* + * If nilfs_palloc_count_max_entries() returns + * -ERANGE error code then we simply treat + * curent inodes count as maximum possible and + * zero as free inodes value. + */ + nmaxinodes = atomic64_read(&root->inodes_count); + nfreeinodes = 0; + err = 0; + } else + return err; + } + buf->f_type = NILFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = blocks - overhead; buf->f_bfree = nfreeblocks; buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? (buf->f_bfree - nrsvblocks) : 0; - buf->f_files = atomic_read(&root->inodes_count); - buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_files = nmaxinodes; + buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, static int nilfs_tree_was_touched(struct dentry *root_dentry) { - return root_dentry->d_count > 1; + return d_count(root_dentry) > 1; } /** diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 41e6a04..94c451c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->ifile = NULL; new->nilfs = nilfs; atomic_set(&new->count, 1); - atomic_set(&new->inodes_count, 0); - atomic_set(&new->blocks_count, 0); + atomic64_set(&new->inodes_count, 0); + atomic64_set(&new->blocks_count, 0); rb_link_node(&new->rb_node, parent, p); rb_insert_color(&new->rb_node, &nilfs->ns_cptree); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index be1267a..de8cc53 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -241,8 +241,8 @@ struct nilfs_root { struct the_nilfs *nilfs; struct inode *ifile; - atomic_t inodes_count; - atomic_t blocks_count; + atomic64_t inodes_count; + atomic64_t blocks_count; }; /* Special checkpoint number */ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 2bfe6dc..1fedd5f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1; static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; static struct fsnotify_group *dnotify_group __read_mostly; -static DEFINE_MUTEX(dnotify_mark_mutex); /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which @@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; @@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id) spin_unlock(&fsn_mark->lock); - /* nothing else could have found us thanks to the dnotify_mark_mutex */ + /* nothing else could have found us thanks to the dnotify_groups + mark_mutex */ if (dn_mark->dn == NULL) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); } @@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. */ fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); @@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, + NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; @@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed - * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the - * only time we clean up the marks we need to get our mark off - * the list. */ + * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the + * fd is the only time we clean up the marks we need to get our mark + * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too @@ -385,9 +386,9 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6c80083..e44cb64 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->reserved = 0; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); if (unlikely(event->mask & FAN_Q_OVERFLOW)) @@ -399,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file) wake_up(&group->fanotify_data.access_waitq); #endif - if (file->f_flags & FASYNC) - fsnotify_fasync(-1, file, 0); - /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); @@ -526,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -550,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); + /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); if (removed & inode->i_fsnotify_mask) @@ -593,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return mask & ~oldmask; } +static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + int ret; + + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return ERR_PTR(-ENOSPC); + + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!mark) + return ERR_PTR(-ENOMEM); + + fsnotify_init_mark(mark, fanotify_free_mark); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + if (ret) { + fsnotify_put_mark(mark); + return ERR_PTR(ret); + } + + return mark; +} + + static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, unsigned int flags) { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) - return -ENOSPC; - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) - return -ENOMEM; - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) - goto err; + fsn_mark = fanotify_add_new_mark(group, NULL, mnt); + if (IS_ERR(fsn_mark)) { + mutex_unlock(&group->mark_mutex); + return PTR_ERR(fsn_mark); + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -630,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -644,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, (atomic_read(&inode->i_writecount) > 0)) return 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) - return -ENOSPC; - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) - return -ENOMEM; - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) - goto err; + fsn_mark = fanotify_add_new_mark(group, inode, NULL); + if (IS_ERR(fsn_mark)) { + mutex_unlock(&group->mark_mutex); + return PTR_ERR(fsn_mark); + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } /* fanotify syscalls */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 959815c..60f954a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, + NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); @@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { int ret = 0; -retry: + mutex_lock(&group->mark_mutex); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); - /* - * inotify_new_watch could race with another thread which did an - * inotify_new_watch between the update_existing and the add watch - * here, go back and try to update an existing mark again. - */ - if (ret == -EEXIST) - goto retry; + mutex_unlock(&group->mark_mutex); return ret; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc6b49b..923fe4a 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -20,28 +20,29 @@ * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: - * The mark->refcnt tells how many "things" in the kernel currently are - * referencing this object. The object typically will live inside the kernel - * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task - * which can find this object holding the appropriete locks, can take a reference - * and the object itself is guaranteed to survive until the reference is dropped. + * The group->recnt and mark->refcnt tell how many "things" in the kernel + * currently are referencing the objects. Both kind of objects typically will + * live inside the kernel with a refcnt of 2, one for its creation and one for + * the reference a group and a mark hold to each other. + * If you are holding the appropriate locks, you can take a reference and the + * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: - * There are 3 spinlocks involved with fsnotify inode marks and they MUST - * be taken in order as follows: + * There are 3 locks involved with fsnotify inode marks and they MUST be taken + * in order as follows: * + * group->mark_mutex * mark->lock - * group->mark_lock * inode->i_lock * - * mark->lock protects 2 things, mark->group and mark->inode. You must hold - * that lock to dereference either of these things (they could be NULL even with - * the lock) - * - * group->mark_lock protects the marks_list anchored inside a given group - * and each mark is hooked via the g_list. It also sorta protects the - * free_g_list, which when used is anchored by a private list on the stack of the - * task which held the group->mark_lock. + * group->mark_mutex protects the marks_list anchored inside a given group and + * each mark is hooked via the g_list. It also protects the groups private + * data (i.e group limits). + + * mark->lock protects the marks attributes like its masks and flags. + * Furthermore it protects the access to a reference of the group that the mark + * is assigned to as well as the access to a reference of the inode/vfsmount + * that is being watched by the mark. * * inode->i_lock protects the i_fsnotify_marks list anchored inside a * given inode and each mark is hooked via the i_list. (and sorta the @@ -64,18 +65,11 @@ * inode. We take i_lock and walk the i_fsnotify_marks safely. For each * mark on the list we take a reference (so the mark can't disappear under us). * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; At this point we no - * longer fear anything finding the mark using the inode's list of marks. - * - * We can safely and locklessly run the private list on the stack of everything - * we just unattached from the original inode. For each mark on the private list - * we grab the mark-> and can thus dereference mark->group and mark->inode. If - * we see the group and inode are not NULL we take those locks. Now holding all - * 3 locks we can completely remove the mark from other tasks finding it in the - * future. Remember, 10 things might already be referencing this mark, but they - * better be holding a ref. We drop our reference we took before we unhooked it - * from the inode. When the ref hits 0 we can free the mark. - * + * private list anchored on the stack using i_free_list; we walk i_free_list + * and before we destroy the mark we make sure that we dont race with a + * concurrent destroy_group by getting a ref to the marks group and taking the + * groups mutex. + * Very similarly for freeing by group, except we use free_g_list. * * This has the very interesting property of being able to run concurrently with diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fa9c05f..d267ea6 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1372,7 +1372,7 @@ retry_writepage: * The page may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ - block_invalidatepage(page, 0); + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); unlock_page(page); ntfs_debug("Write outside i_size - truncated?"); return 0; diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index aa411c3..9e38daf 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1004,13 +1004,11 @@ dir_err_out: /** * ntfs_filldir - ntfs specific filldir method * @vol: current ntfs volume - * @fpos: position in the directory * @ndir: ntfs inode of current directory * @ia_page: page in which the index allocation buffer @ie is in resides * @ie: current index entry * @name: buffer to use for the converted name - * @dirent: vfs filldir callback context - * @filldir: vfs filldir callback + * @actor: what to feed the entries to * * Convert the Unicode @name to the loaded NLS and pass it to the @filldir * callback. @@ -1024,12 +1022,12 @@ dir_err_out: * retake the lock if we are returning a non-zero value as ntfs_readdir() * would need to drop the lock immediately anyway. */ -static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, +static inline int ntfs_filldir(ntfs_volume *vol, ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, - u8 *name, void *dirent, filldir_t filldir) + u8 *name, struct dir_context *actor) { unsigned long mref; - int name_len, rc; + int name_len; unsigned dt_type; FILE_NAME_TYPE_FLAGS name_type; @@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, if (ia_page) unlock_page(ia_page); ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " - "0x%lx, DT_%s.", name, name_len, fpos, mref, + "0x%lx, DT_%s.", name, name_len, actor->pos, mref, dt_type == DT_DIR ? "DIR" : "REG"); - rc = filldir(dirent, name, name_len, fpos, mref, dt_type); + if (!dir_emit(actor, name, name_len, mref, dt_type)) + return 1; /* Relock the page but not if we are aborting ->readdir. */ - if (!rc && ia_page) + if (ia_page) lock_page(ia_page); - return rc; + return 0; } /* @@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, * removes them again after the write is complete after which it * unlocks the page. */ -static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ntfs_readdir(struct file *file, struct dir_context *actor) { s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t fpos, i_size; - struct inode *bmp_vi, *vdir = file_inode(filp); + loff_t i_size; + struct inode *bmp_vi, *vdir = file_inode(file); struct super_block *sb = vdir->i_sb; ntfs_inode *ndir = NTFS_I(vdir); ntfs_volume *vol = NTFS_SB(sb); @@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) u8 *kaddr, *bmp, *index_end; ntfs_attr_search_ctx *ctx; - fpos = filp->f_pos; ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", - vdir->i_ino, fpos); + vdir->i_ino, actor->pos); rc = err = 0; /* Are we at end of dir yet? */ i_size = i_size_read(vdir); - if (fpos >= i_size + vol->mft_record_size) - goto done; + if (actor->pos >= i_size + vol->mft_record_size) + return 0; /* Emulate . and .. for all directories. */ - if (!fpos) { - ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " - "inode 0x%lx, DT_DIR.", vdir->i_ino); - rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR); - if (rc) - goto done; - fpos++; - } - if (fpos == 1) { - ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, " - "inode 0x%lx, DT_DIR.", - (unsigned long)parent_ino(filp->f_path.dentry)); - rc = filldir(dirent, "..", 2, fpos, - parent_ino(filp->f_path.dentry), DT_DIR); - if (rc) - goto done; - fpos++; - } + if (!dir_emit_dots(file, actor)) + return 0; m = NULL; ctx = NULL; /* @@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto err_out; } /* Are we jumping straight into the index allocation attribute? */ - if (fpos >= vol->mft_record_size) + if (actor->pos >= vol->mft_record_size) goto skip_index_root; /* Get hold of the mft record for the directory. */ m = map_mft_record(ndir); @@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto err_out; } /* Get the offset into the index root attribute. */ - ir_pos = (s64)fpos; + ir_pos = (s64)actor->pos; /* Find the index root attribute in the mft record. */ err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 0, ctx); @@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (ir_pos > (u8*)ie - (u8*)ir) continue; /* Advance the position even if going to skip the entry. */ - fpos = (u8*)ie - (u8*)ir; + actor->pos = (u8*)ie - (u8*)ir; /* Submit the name to the filldir callback. */ - rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, - filldir); + rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); if (rc) { kfree(ir); goto abort; @@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (!NInoIndexAllocPresent(ndir)) goto EOD; /* Advance fpos to the beginning of the index allocation. */ - fpos = vol->mft_record_size; + actor->pos = vol->mft_record_size; skip_index_root: kaddr = NULL; prev_ia_pos = -1LL; /* Get the offset into the index allocation attribute. */ - ia_pos = (s64)fpos - vol->mft_record_size; + ia_pos = (s64)actor->pos - vol->mft_record_size; ia_mapping = vdir->i_mapping; ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); @@ -1409,7 +1390,7 @@ find_next_index_buffer: if (ia_pos - ia_start > (u8*)ie - (u8*)ia) continue; /* Advance the position even if going to skip the entry. */ - fpos = (u8*)ie - (u8*)ia + + actor->pos = (u8*)ie - (u8*)ia + (sle64_to_cpu(ia->index_block_vcn) << ndir->itype.index.vcn_size_bits) + vol->mft_record_size; @@ -1419,8 +1400,7 @@ find_next_index_buffer: * before returning, unless a non-zero value is returned in * which case the page is left unlocked. */ - rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, - filldir); + rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); if (rc) { /* @ia_page is already unlocked in this case. */ ntfs_unmap_page(ia_page); @@ -1439,18 +1419,9 @@ unm_EOD: iput(bmp_vi); EOD: /* We are finished, set fpos to EOD. */ - fpos = i_size + vol->mft_record_size; + actor->pos = i_size + vol->mft_record_size; abort: kfree(name); -done: -#ifdef DEBUG - if (!rc) - ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos); - else - ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.", - rc, fpos); -#endif - filp->f_pos = fpos; return 0; err_out: if (bmp_page) { @@ -1471,7 +1442,6 @@ iput_err_out: if (!err) err = -EIO; ntfs_debug("Failed. Returning error code %i.", -err); - filp->f_pos = fpos; return err; } @@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, const struct file_operations ntfs_dir_ops = { .llseek = generic_file_llseek, /* Seek inside directory. */ .read = generic_read_dir, /* Return -EISDIR. */ - .readdir = ntfs_readdir, /* Read directory contents. */ + .iterate = ntfs_readdir, /* Read directory contents. */ #ifdef NTFS_RW .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ /*.aio_fsync = ,*/ /* Sync all outstanding async diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b8a9d87..17e6bdd 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode, &ref_tree, NULL); if (ret) { mlog_errno(ret); - goto out; + goto bail; } ret = ocfs2_prepare_refcount_change_for_del(inode, @@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode, &extra_blocks); if (ret < 0) { mlog_errno(ret); - goto out; + goto bail; } } @@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode, extra_blocks); if (ret) { mlog_errno(ret); - return ret; + goto bail; } mutex_lock(&tl_inode->i_mutex); @@ -5734,7 +5734,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out: mutex_unlock(&tl_inode->i_mutex); - +bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 20dfec7..79736a2 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, * from ext3. PageChecked() bits have been removed as OCFS2 does not * do journalled data. */ -static void ocfs2_invalidatepage(struct page *page, unsigned long offset) +static void ocfs2_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - jbd2_journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset, length); } static int ocfs2_releasepage(struct page *page, gfp_t wait) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 42252bf..5c1c864 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } -static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode) { int ret = -1; @@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE, bio); + submit_bio(WRITE_SYNC, bio); status = 0; bail: @@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) continue; - ret = o2hb_global_hearbeat_mode_set(i); + ret = o2hb_global_heartbeat_mode_set(i); if (!ret) printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", o2hb_heartbeat_mode_desc[i]); @@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { NULL, }; -static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { +static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { .show_attribute = o2hb_heartbeat_group_show, .store_attribute = o2hb_heartbeat_group_store, }; @@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { static struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; @@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid) assert_spin_locked(&o2hb_live_lock); list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + uuid = config_item_name(®->hr_item); /* local heartbeat */ @@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid) assert_spin_locked(&o2hb_live_lock); list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + uuid = config_item_name(®->hr_item); if (region_uuid) { if (strcmp(region_uuid, uuid)) @@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions) p = region_uuids; list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); if (numregs < max_regions) { memcpy(p, config_item_name(®->hr_item), diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index c19897d..1ec141e 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node) /* This is analogous to hb_up. as a node's connection comes up we delay the * quorum decision until we see it heartbeating. the hold will be droped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if - * it's already heartbeating we we might be dropping a hold that conn_up got. + * it's already heartbeating we might be dropping a hold that conn_up got. * */ void o2quo_conn_up(u8 node) { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index aa88bd8..d644dc6 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref) sc->sc_node = NULL; o2net_debug_del_sc(sc); + + if (sc->sc_page) + __free_page(sc->sc_page); kfree(sc); } @@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk) state_change = sc->sc_state_change; switch(sk->sk_state) { - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - o2net_sc_queue_work(sc, &sc->sc_connect_work); - break; - default: - printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT - " shutdown, state %d\n", - SC_NODEF_ARGS(sc), sk->sk_state); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); - break; + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; } out: read_unlock(&sk->sk_callback_lock); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f1e1aed..eb760d8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1761,11 +1761,10 @@ bail: static int ocfs2_dir_foreach_blk_id(struct inode *inode, u64 *f_version, - loff_t *f_pos, void *priv, - filldir_t filldir, int *filldir_err) + struct dir_context *ctx) { - int ret, i, filldir_ret; - unsigned long offset = *f_pos; + int ret, i; + unsigned long offset = ctx->pos; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_inline_data *data; @@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, di = (struct ocfs2_dinode *)di_bh->b_data; data = &di->id2.i_data; - while (*f_pos < i_size_read(inode)) { -revalidate: + while (ctx->pos < i_size_read(inode)) { /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block @@ -1802,50 +1800,31 @@ revalidate: break; i += le16_to_cpu(de->rec_len); } - *f_pos = offset = i; + ctx->pos = offset = i; *f_version = inode->i_version; } - de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); - if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { + de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos); + if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) { /* On error, skip the f_pos to the end. */ - *f_pos = i_size_read(inode); - goto out; + ctx->pos = i_size_read(inode); + break; } offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = *f_version; unsigned char d_type = DT_UNKNOWN; if (de->file_type < OCFS2_FT_MAX) d_type = ocfs2_filetype_table[de->file_type]; - filldir_ret = filldir(priv, de->name, - de->name_len, - *f_pos, - le64_to_cpu(de->inode), - d_type); - if (filldir_ret) { - if (filldir_err) - *filldir_err = filldir_ret; - break; - } - if (version != *f_version) - goto revalidate; + if (!dir_emit(ctx, de->name, de->name_len, + le64_to_cpu(de->inode), d_type)) + goto out; } - *f_pos += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } - out: brelse(di_bh); - return 0; } @@ -1855,27 +1834,26 @@ out: */ static int ocfs2_dir_foreach_blk_el(struct inode *inode, u64 *f_version, - loff_t *f_pos, void *priv, - filldir_t filldir, int *filldir_err) + struct dir_context *ctx, + bool persist) { - int error = 0; unsigned long offset, blk, last_ra_blk = 0; - int i, stored; + int i; struct buffer_head * bh, * tmp; struct ocfs2_dir_entry * de; struct super_block * sb = inode->i_sb; unsigned int ra_sectors = 16; + int stored = 0; - stored = 0; bh = NULL; - offset = (*f_pos) & (sb->s_blocksize - 1); + offset = ctx->pos & (sb->s_blocksize - 1); - while (!error && !stored && *f_pos < i_size_read(inode)) { - blk = (*f_pos) >> sb->s_blocksize_bits; + while (ctx->pos < i_size_read(inode)) { + blk = ctx->pos >> sb->s_blocksize_bits; if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { /* Skip the corrupt dirblock and keep trying */ - *f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } @@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, ra_sectors = 8; } -revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block @@ -1917,93 +1894,64 @@ revalidate: i += le16_to_cpu(de->rec_len); } offset = i; - *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; *f_version = inode->i_version; } - while (!error && *f_pos < i_size_read(inode) + while (ctx->pos < i_size_read(inode) && offset < sb->s_blocksize) { de = (struct ocfs2_dir_entry *) (bh->b_data + offset); if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { /* On error, skip the f_pos to the next block. */ - *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; brelse(bh); - goto out; + continue; } - offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - unsigned long version = *f_version; unsigned char d_type = DT_UNKNOWN; if (de->file_type < OCFS2_FT_MAX) d_type = ocfs2_filetype_table[de->file_type]; - error = filldir(priv, de->name, + if (!dir_emit(ctx, de->name, de->name_len, - *f_pos, le64_to_cpu(de->inode), - d_type); - if (error) { - if (filldir_err) - *filldir_err = error; - break; + d_type)) { + brelse(bh); + return 0; } - if (version != *f_version) - goto revalidate; - stored ++; + stored++; } - *f_pos += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } offset = 0; brelse(bh); bh = NULL; + if (!persist && stored) + break; } - - stored = 0; -out: - return stored; + return 0; } static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, - loff_t *f_pos, void *priv, filldir_t filldir, - int *filldir_err) + struct dir_context *ctx, + bool persist) { if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) - return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, - filldir, filldir_err); - - return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir, - filldir_err); + return ocfs2_dir_foreach_blk_id(inode, f_version, ctx); + return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist); } /* * This is intended to be called from inside other kernel functions, * so we fake some arguments. */ -int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, - filldir_t filldir) +int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx) { - int ret = 0, filldir_err = 0; u64 version = inode->i_version; - - while (*f_pos < i_size_read(inode)) { - ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv, - filldir, &filldir_err); - if (ret || filldir_err) - break; - } - - if (ret > 0) - ret = -EIO; - + ocfs2_dir_foreach_blk(inode, &version, ctx, true); return 0; } @@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, * ocfs2_readdir() * */ -int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +int ocfs2_readdir(struct file *file, struct dir_context *ctx) { int error = 0; - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention @@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) goto bail_nolock; } - error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, - dirent, filldir, NULL); + error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); ocfs2_inode_unlock(inode, lock_level); if (error) @@ -2120,6 +2067,7 @@ bail: } struct ocfs2_empty_dir_priv { + struct dir_context ctx; unsigned seen_dot; unsigned seen_dot_dot; unsigned seen_other; @@ -2204,8 +2152,9 @@ out: int ocfs2_empty_dir(struct inode *inode) { int ret; - loff_t start = 0; - struct ocfs2_empty_dir_priv priv; + struct ocfs2_empty_dir_priv priv = { + .ctx.actor = ocfs2_empty_dir_filldir + }; memset(&priv, 0, sizeof(priv)); @@ -2219,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode) */ } - ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); + ret = ocfs2_dir_foreach(inode, &priv.ctx); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index e683f3d..f0344b7 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name, struct ocfs2_dir_lookup_result *res); int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, int namelen, u64 *blkno); -int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); -int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, - filldir_t filldir); +int ocfs2_readdir(struct file *file, struct dir_context *ctx); +int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx); int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, struct inode *dir, struct buffer_head *parent_fe_bh, diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 975810b..47e67c2 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, lock->ml.node); } } else { + status = DLM_NORMAL; dlm_lock_get(lock); list_add_tail(&lock->list, &res->blocked); kick_thread = 1; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b3fdd1a..773bd32 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -55,9 +55,6 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_recovery_thread(void *data); -void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); -int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); -void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); static int dlm_do_recovery(struct dlm_ctxt *dlm); static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); @@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node) { struct dlm_lock_request lr; - enum dlm_status ret; + int ret; mlog(0, "\n"); @@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, lr.dead_node = dead_node; // send message - ret = DLM_NOLOCKMGR; ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, &lr, sizeof(lr), request_from, NULL); @@ -1408,6 +1404,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; @@ -2695,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); + dlm_put(dlm); return -EAGAIN; } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 1c39efb..2487116 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, &hole_size, &rec, &is_last); if (ret) { mlog_errno(ret); - goto out; + goto out_unlock; } if (rec.e_blkno == 0ULL) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8a7509f..41000f2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2288,7 +2288,7 @@ relock: ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); - goto out_sems; + goto out; } ocfs2_inode_unlock(inode, 1); @@ -2646,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) goto out; } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - ret = -EINVAL; - if (!ret && offset > inode->i_sb->s_maxbytes) - ret = -EINVAL; - if (ret) - goto out; - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: mutex_unlock(&inode->i_mutex); @@ -2712,7 +2702,7 @@ const struct file_operations ocfs2_fops = { const struct file_operations ocfs2_dops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ocfs2_readdir, + .iterate = ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2759,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = { const struct file_operations ocfs2_dops_no_plocks = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = ocfs2_readdir, + .iterate = ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 8eccfab..242170d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) } struct ocfs2_orphan_filldir_priv { + struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; }; @@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, { int status; struct inode *orphan_dir_inode = NULL; - struct ocfs2_orphan_filldir_priv priv; - loff_t pos = 0; - - priv.osb = osb; - priv.head = *head; + struct ocfs2_orphan_filldir_priv priv = { + .ctx.actor = ocfs2_orphan_filldir, + .osb = osb, + .head = *head + }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, @@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, goto out; } - status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, - ocfs2_orphan_filldir); + status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx); if (status) { mlog_errno(status); goto out_cluster; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index a3385b6..96f9ac2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) { - atomic_set(&osb->needs_checkpoint, 1); wake_up(&osb->checkpoint_event); } @@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb); + ocfs2_quota_trans_credits(sb) + bits_wanted; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b5..be3f867 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); - le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); fe->i_atime = fe->i_ctime = fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = @@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry) return ret; } -static inline int inode_is_unlinkable(struct inode *inode) +static inline int ocfs2_inode_is_unlinkable(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { if (inode->i_nlink == 2) @@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir, { int status; int child_locked = 0; + bool is_unlinkable = false; struct inode *inode = dentry->d_inode; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); @@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - if (inode_is_unlinkable(inode)) { + if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, orphan_name, &orphan_insert); @@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir, mlog_errno(status); goto leave; } + is_unlinkable = true; } handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); @@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir, fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (inode_is_unlinkable(inode)) { - status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto leave; - } - } - /* delete the name from the parent dir */ status = ocfs2_delete_entry(handle, dir, &lookup); if (status < 0) { @@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir, mlog_errno(status); if (S_ISDIR(inode->i_mode)) inc_nlink(dir); + goto leave; + } + + if (is_unlinkable) { + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, + orphan_name, &orphan_insert, orphan_dir); + if (status < 0) + mlog_errno(status); } leave: @@ -947,7 +948,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; @@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* we're a cluster, and nlink can change on disk from * underneath us... */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; @@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_dir_bh, lookup); if (status < 0) { mlog_errno(status); - goto leave; + goto rollback; } - /* - * We're going to journal the change of i_flags and i_orphaned_slot. - * It's safe anyway, though some callers may duplicate the journaling. - * Journaling within the func just make the logic look more - * straightforward. - */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(inode), - fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; /* Record which orphan dir our inode now resides @@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); +rollback: + if (status < 0) { + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); + } + leave: brelse(orphan_dir_bh); - if (status) - mlog_errno(status); return status; } @@ -2216,7 +2222,7 @@ out: brelse(orphan_dir_bh); - return 0; + return ret; } int ocfs2_create_inode_in_orphan(struct inode *dir, @@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } di = (struct ocfs2_dinode *)di_bh->b_data; - le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d355e6e..3a90347 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -347,7 +347,6 @@ struct ocfs2_super struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; - atomic_t needs_checkpoint; struct ocfs2_journal *journal; unsigned long osb_commit_interval; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b7e74b5..5397c07 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle, int status; /* there is a really tiny chance the journal calls could fail, * but we wouldn't want inconsistent blocks in *any* case. */ - u64 fe_ptr, bg_ptr, prev_bg_ptr; + u64 bg_ptr, prev_bg_ptr; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; @@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle, (unsigned long long)le64_to_cpu(bg->bg_blkno), (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); - fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), prev_bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + if (status < 0) + goto out; prev_bg->bg_next_group = bg->bg_next_group; ocfs2_journal_dirty(handle, prev_bg_bh); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + if (status < 0) + goto out_rollback_prev_bg; bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; ocfs2_journal_dirty(handle, bg_bh); status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + if (status < 0) + goto out_rollback_bg; fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; ocfs2_journal_dirty(handle, fe_bh); -out_rollback: - if (status < 0) { - fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); - bg->bg_next_group = cpu_to_le64(bg_ptr); - prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); - } - - if (status) +out: + if (status < 0) mlog_errno(status); return status; + +out_rollback_bg: + bg->bg_next_group = cpu_to_le64(bg_ptr); +out_rollback_prev_bg: + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + goto out; } static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 01b8516..854d809 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) spin_unlock(&osb->osb_lock); out += snprintf(buf + out, len - out, - "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), - osb->osb_commit_interval, - atomic_read(&osb->needs_checkpoint)); + osb->osb_commit_interval); out += snprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", @@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb, } init_waitqueue_head(&osb->checkpoint_event); - atomic_set(&osb->needs_checkpoint, 0); osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2e3ea30..317ef0a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, { int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; struct ocfs2_xa_loc loc; if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) @@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, down_write(&oi->ip_alloc_sem); if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { - if (!ocfs2_xattr_has_space_inline(inode, di)) { - ret = -ENOSPC; - goto out; - } - } - - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); if (ret) { if (ret != -ENOSPC) @@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) } new_oi = OCFS2_I(args->new_inode); + /* + * Adjust extent record count to reserve space for extended attribute. + * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). + */ + if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && + !(ocfs2_inode_is_fast_symlink(args->new_inode))) { + struct ocfs2_extent_list *el = &new_di->id2.i_list; + le16_add_cpu(&el->l_count, -(inline_size / + sizeof(struct ocfs2_extent_rec))); + } spin_lock(&new_oi->ip_lock); new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index acbaebc..1b8e9e8 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, return is_bad; } -static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, +static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, u64 fsblock, int hindex) { - struct inode *dir = file_inode(filp); - struct buffer_head *bh; - struct omfs_inode *oi; - u64 self; - int res = 0; - unsigned char d_type; - /* follow chain in this bucket */ while (fsblock != ~0) { - bh = omfs_bread(dir->i_sb, fsblock); + struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock); + struct omfs_inode *oi; + u64 self; + unsigned char d_type; + if (!bh) - goto out; + return true; oi = (struct omfs_inode *) bh->b_data; if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { brelse(bh); - goto out; + return true; } self = fsblock; @@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; - res = filldir(dirent, oi->i_name, strnlen(oi->i_name, - OMFS_NAMELEN), filp->f_pos, self, d_type); + if (!dir_emit(ctx, oi->i_name, + strnlen(oi->i_name, OMFS_NAMELEN), + self, d_type)) { + brelse(bh); + return false; + } brelse(bh); - if (res < 0) - break; - filp->f_pos++; + ctx->pos++; } -out: - return res; + return true; } static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -403,60 +401,44 @@ out: return err; } -static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int omfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *dir = file_inode(filp); + struct inode *dir = file_inode(file); struct buffer_head *bh; - loff_t offset, res; + __be64 *p; unsigned int hchain, hindex; int nbuckets; - u64 fsblock; - int ret = -EINVAL; - - if (filp->f_pos >> 32) - goto success; - - switch ((unsigned long) filp->f_pos) { - case 0: - if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) - goto success; - filp->f_pos++; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, 1, - parent_ino(filp->f_dentry), DT_DIR) < 0) - goto success; - filp->f_pos = 1 << 20; - /* fall through */ + + if (ctx->pos >> 32) + return -EINVAL; + + if (ctx->pos < 1 << 20) { + if (!dir_emit_dots(file, ctx)) + return 0; + ctx->pos = 1 << 20; } nbuckets = (dir->i_size - OMFS_DIR_START) / 8; /* high 12 bits store bucket + 1 and low 20 bits store hash index */ - hchain = (filp->f_pos >> 20) - 1; - hindex = filp->f_pos & 0xfffff; + hchain = (ctx->pos >> 20) - 1; + hindex = ctx->pos & 0xfffff; bh = omfs_bread(dir->i_sb, dir->i_ino); if (!bh) - goto out; + return -EINVAL; - offset = OMFS_DIR_START + hchain * 8; + p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain; - for (; hchain < nbuckets; hchain++, offset += 8) { - fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); - - res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); - hindex = 0; - if (res < 0) + for (; hchain < nbuckets; hchain++) { + __u64 fsblock = be64_to_cpu(*p++); + if (!omfs_fill_chain(dir, ctx, fsblock, hindex)) break; - - filp->f_pos = (hchain+2) << 20; + hindex = 0; + ctx->pos = (hchain+2) << 20; } brelse(bh); -success: - ret = 0; -out: - return ret; + return 0; } const struct inode_operations omfs_dir_inops = { @@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = { const struct file_operations omfs_dir_operations = { .read = generic_read_dir, - .readdir = omfs_readdir, + .iterate = omfs_readdir, .llseek = generic_file_llseek, }; @@ -840,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & __O_SYNC) flags |= O_DSYNC; - /* - * If we have O_PATH in the open flag. Then we - * cannot have anything other than the below set of flags - */ - if (flags & O_PATH) { + if (flags & O_TMPFILE) { + if (!(flags & O_CREAT)) + return -EINVAL; + acc_mode = MAY_OPEN | ACC_MODE(flags); + } else if (flags & O_PATH) { + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; acc_mode = 0; } else { @@ -876,7 +880,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - return lookup_flags; + op->lookup_flags = lookup_flags; + return 0; } /** @@ -893,8 +898,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; - int lookup = build_open_flags(flags, mode, &op); - return do_filp_open(AT_FDCWD, name, &op, lookup); + int err = build_open_flags(flags, mode, &op); + return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); } /** @@ -919,37 +924,43 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *filename, int flags) { struct open_flags op; - int lookup = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, 0, &op); + if (err) + return ERR_PTR(err); if (flags & O_CREAT) return ERR_PTR(-EINVAL); if (!filename && (flags & O_DIRECTORY)) if (!dentry->d_inode->i_op->lookup) return ERR_PTR(-ENOTDIR); - return do_file_open_root(dentry, mnt, filename, &op, lookup); + return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; - int lookup = build_open_flags(flags, mode, &op); - struct filename *tmp = getname(filename); - int fd = PTR_ERR(tmp); - - if (!IS_ERR(tmp)) { - fd = get_unused_fd_flags(flags); - if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, &op, lookup); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fsnotify_open(f); - fd_install(fd, f); - } + int fd = build_open_flags(flags, mode, &op); + struct filename *tmp; + + if (fd) + return fd; + + tmp = getname(filename); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + fd = get_unused_fd_flags(flags); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); } - putname(tmp); } + putname(tmp); return fd; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 75885ff..8c0ceb8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = { .release = seq_release, }; -static int openpromfs_readdir(struct file *, void *, filldir_t); +static int openpromfs_readdir(struct file *, struct dir_context *); static const struct file_operations openprom_operations = { .read = generic_read_dir, - .readdir = openpromfs_readdir, + .iterate = openpromfs_readdir, .llseek = generic_file_llseek, }; @@ -260,71 +260,64 @@ found: return NULL; } -static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int openpromfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct op_inode_info *oi = OP_I(inode); struct device_node *dp = oi->u.node; struct device_node *child; struct property *prop; - unsigned int ino; int i; mutex_lock(&op_mutex); - ino = inode->i_ino; - i = filp->f_pos; - switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + if (ctx->pos == 0) { + if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR)) goto out; - i++; - filp->f_pos++; - /* fall thru */ - case 1: - if (filldir(dirent, "..", 2, i, + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!dir_emit(ctx, "..", 2, (dp->parent == NULL ? OPENPROM_ROOT_INO : - dp->parent->unique_id), DT_DIR) < 0) + dp->parent->unique_id), DT_DIR)) goto out; - i++; - filp->f_pos++; - /* fall thru */ - default: - i -= 2; - - /* First, the children nodes as directories. */ - child = dp->child; - while (i && child) { - child = child->sibling; - i--; - } - while (child) { - if (filldir(dirent, - child->path_component_name, - strlen(child->path_component_name), - filp->f_pos, child->unique_id, DT_DIR) < 0) - goto out; - - filp->f_pos++; - child = child->sibling; - } + ctx->pos = 2; + } + i = ctx->pos - 2; - /* Next, the properties as files. */ - prop = dp->properties; - while (i && prop) { - prop = prop->next; - i--; - } - while (prop) { - if (filldir(dirent, prop->name, strlen(prop->name), - filp->f_pos, prop->unique_id, DT_REG) < 0) - goto out; + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; + i--; + } + while (child) { + if (!dir_emit(ctx, + child->path_component_name, + strlen(child->path_component_name), + child->unique_id, DT_DIR)) + goto out; - filp->f_pos++; - prop = prop->next; - } + ctx->pos++; + child = child->sibling; + } + + /* Next, the properties as files. */ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; } + while (prop) { + if (!dir_emit(ctx, prop->name, strlen(prop->name), + prop->unique_id, DT_REG)) + goto out; + + ctx->pos++; + prop = prop->next; + } + out: mutex_unlock(&op_mutex); return 0; @@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } - if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) mnt_release_group_id(mnt); list_del_init(&mnt->mnt_share); diff --git a/fs/proc/base.c b/fs/proc/base.c index dd51e50..1485e38 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1681,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations = * reported by readdir in sync with the inode numbers reported * by stat. */ -int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, +bool proc_fill_cache(struct file *file, struct dir_context *ctx, const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { - struct dentry *child, *dir = filp->f_path.dentry; + struct dentry *child, *dir = file->f_path.dentry; + struct qstr qname = QSTR_INIT(name, len); struct inode *inode; - struct qstr qname; - ino_t ino = 0; - unsigned type = DT_UNKNOWN; - - qname.name = name; - qname.len = len; - qname.hash = full_name_hash(name, len); + unsigned type; + ino_t ino; - child = d_lookup(dir, &qname); + child = d_hash_and_lookup(dir, &qname); if (!child) { - struct dentry *new; - new = d_alloc(dir, &qname); - if (new) { - child = instantiate(dir->d_inode, new, task, ptr); - if (child) - dput(new); - else - child = new; + child = d_alloc(dir, &qname); + if (!child) + goto end_instantiate; + if (instantiate(dir->d_inode, child, task, ptr) < 0) { + dput(child); + goto end_instantiate; } } - if (!child || IS_ERR(child) || !child->d_inode) - goto end_instantiate; inode = child->d_inode; - if (inode) { - ino = inode->i_ino; - type = inode->i_mode >> 12; - } + ino = inode->i_ino; + type = inode->i_mode >> 12; dput(child); + return dir_emit(ctx, name, len, ino, type); + end_instantiate: - if (!ino) - ino = find_inode_number(dir, &qname); - if (!ino) - ino = 1; - return filldir(dirent, name, len, filp->f_pos, ino, type); + return dir_emit(ctx, name, len, 1, DT_UNKNOWN); } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -1846,7 +1834,7 @@ struct map_files_info { unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; -static struct dentry * +static int proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -1856,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) - return ERR_PTR(-ENOENT); + return -ENOENT; ei = PROC_I(inode); ei->op.proc_get_link = proc_map_files_get_link; @@ -1873,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, d_set_d_op(dentry, &tid_map_files_dentry_operations); d_add(dentry, inode); - return NULL; + return 0; } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -1882,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, unsigned long vm_start, vm_end; struct vm_area_struct *vma; struct task_struct *task; - struct dentry *result; + int result; struct mm_struct *mm; - result = ERR_PTR(-EPERM); + result = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out; - result = ERR_PTR(-ENOENT); + result = -ENOENT; task = get_proc_task(dir); if (!task) goto out; - result = ERR_PTR(-EACCES); + result = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - result = ERR_PTR(-ENOENT); + result = -ENOENT; if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) goto out_put_task; @@ -1921,7 +1909,7 @@ out_no_vma: out_put_task: put_task_struct(task); out: - return result; + return ERR_PTR(result); } static const struct inode_operations proc_map_files_inode_operations = { @@ -1931,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = { }; static int -proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +proc_map_files_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; struct vm_area_struct *vma; struct task_struct *task; struct mm_struct *mm; - ino_t ino; + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; int ret; ret = -EPERM; @@ -1946,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; ret = -ENOENT; - task = get_proc_task(inode); + task = get_proc_task(file_inode(file)); if (!task) goto out; @@ -1955,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out_put_task; ret = 0; - switch (filp->f_pos) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) - goto out_put_task; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_put_task; - filp->f_pos++; - default: - { - unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; - struct map_files_info *p; - - mm = get_task_mm(task); - if (!mm) - goto out_put_task; - down_read(&mm->mmap_sem); + if (!dir_emit_dots(file, ctx)) + goto out_put_task; - nr_files = 0; + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + down_read(&mm->mmap_sem); - /* - * We need two passes here: - * - * 1) Collect vmas of mapped files with mmap_sem taken - * 2) Release mmap_sem and instantiate entries - * - * otherwise we get lockdep complained, since filldir() - * routine might require mmap_sem taken in might_fault(). - */ + nr_files = 0; - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > filp->f_pos) - nr_files++; - } + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { - ret = -ENOMEM; - if (fa) - flex_array_free(fa); - up_read(&mm->mmap_sem); - mmput(mm); - goto out_put_task; - } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= filp->f_pos) - continue; - - info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); - if (flex_array_put(fa, i++, &info, GFP_KERNEL)) - BUG(); - } + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > ctx->pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_put_task; } - up_read(&mm->mmap_sem); - - for (i = 0; i < nr_files; i++) { - p = flex_array_get(fa, i); - ret = proc_fill_cache(filp, dirent, filldir, - p->name, p->len, - proc_map_files_instantiate, - task, - (void *)(unsigned long)p->mode); - if (ret) - break; - filp->f_pos++; + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; + + info.mode = vma->vm_file->f_mode; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); } - if (fa) - flex_array_free(fa); - mmput(mm); } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + if (!proc_fill_cache(file, ctx, + p->name, p->len, + proc_map_files_instantiate, + task, + (void *)(unsigned long)p->mode)) + break; + ctx->pos++; } + if (fa) + flex_array_free(fa); + mmput(mm); out_put_task: put_task_struct(task); @@ -2049,7 +2020,7 @@ out: static const struct file_operations proc_map_files_operations = { .read = generic_read_dir, - .readdir = proc_map_files_readdir, + .iterate = proc_map_files_readdir, .llseek = default_llseek, }; @@ -2118,6 +2089,7 @@ static int show_timer(struct seq_file *m, void *v) nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } @@ -2151,13 +2123,12 @@ static const struct file_operations proc_timers_operations = { }; #endif /* CONFIG_CHECKPOINT_RESTORE */ -static struct dentry *proc_pident_instantiate(struct inode *dir, +static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) @@ -2176,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_pident_lookup(struct inode *dir, @@ -2186,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - struct dentry *error; + int error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; - error = ERR_PTR(-ENOENT); + error = -ENOENT; if (!task) goto out_no_task; @@ -2213,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir, out: put_task_struct(task); out_no_task: - return error; -} - -static int proc_pident_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_pident_instantiate, task, p); + return ERR_PTR(error); } -static int proc_pident_readdir(struct file *filp, - void *dirent, filldir_t filldir, +static int proc_pident_readdir(struct file *file, struct dir_context *ctx, const struct pid_entry *ents, unsigned int nents) { - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - const struct pid_entry *p, *last; - ino_t ino; - int ret; + struct task_struct *task = get_proc_task(file_inode(file)); + const struct pid_entry *p; - ret = -ENOENT; if (!task) - goto out_no_task; + return -ENOENT; - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= nents) { - ret = 1; - goto out; - } - p = ents + i; - last = &ents[nents - 1]; - while (p <= last) { - if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) - goto out; - filp->f_pos++; - p++; - } - } + if (!dir_emit_dots(file, ctx)) + goto out; + + if (ctx->pos >= nents + 2) + goto out; - ret = 1; + for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + if (!proc_fill_cache(file, ctx, p->name, p->len, + proc_pident_instantiate, task, p)) + break; + ctx->pos++; + } out: put_task_struct(task); -out_no_task: - return ret; + return 0; } #ifdef CONFIG_SECURITY @@ -2361,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = { REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), }; -static int proc_attr_dir_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); + return proc_pident_readdir(file, ctx, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); } static const struct file_operations proc_attr_dir_operations = { .read = generic_read_dir, - .readdir = proc_attr_dir_readdir, + .iterate = proc_attr_dir_readdir, .llseek = default_llseek, }; @@ -2724,16 +2657,15 @@ static const struct pid_entry tgid_base_stuff[] = { #endif }; -static int proc_tgid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); + return proc_pident_readdir(file, ctx, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } static const struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, - .readdir = proc_tgid_base_readdir, + .iterate = proc_tgid_base_readdir, .llseek = default_llseek, }; @@ -2835,11 +2767,10 @@ void proc_flush_task(struct task_struct *task) } } -static struct dentry *proc_pid_instantiate(struct inode *dir, - struct dentry * dentry, - struct task_struct *task, const void *ptr) +static int proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); @@ -2859,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = NULL; + int result = 0; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; @@ -2887,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign result = proc_pid_instantiate(dir, dentry, task, NULL); put_task_struct(task); out: - return result; + return ERR_PTR(result); } /* @@ -2935,58 +2866,42 @@ retry: #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) -static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct tgid_iter iter) -{ - char name[PROC_NUMBUF]; - int len = snprintf(name, sizeof(name), "%d", iter.tgid); - return proc_fill_cache(filp, dirent, filldir, name, len, - proc_pid_instantiate, iter.task, NULL); -} - -static int fake_filldir(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned d_type) -{ - return 0; -} - /* for the /proc/ directory itself, after non-process stuff has been done */ -int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns; - filldir_t __filldir; - loff_t pos = filp->f_pos; + struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info; + loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out; + return 0; if (pos == TGID_OFFSET - 1) { - if (proc_fill_cache(filp, dirent, filldir, "self", 4, - NULL, NULL, NULL) < 0) - goto out; + struct inode *inode = ns->proc_self->d_inode; + if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + return 0; iter.tgid = 0; } else { iter.tgid = pos - TGID_OFFSET; } iter.task = NULL; - ns = filp->f_dentry->d_sb->s_fs_info; for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - if (has_pid_permissions(ns, iter.task, 2)) - __filldir = filldir; - else - __filldir = fake_filldir; + char name[PROC_NUMBUF]; + int len; + if (!has_pid_permissions(ns, iter.task, 2)) + continue; - filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { + len = snprintf(name, sizeof(name), "%d", iter.tgid); + ctx->pos = iter.tgid + TGID_OFFSET; + if (!proc_fill_cache(file, ctx, name, len, + proc_pid_instantiate, iter.task, NULL)) { put_task_struct(iter.task); - goto out; + return 0; } } - filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; -out: + ctx->pos = PID_MAX_LIMIT + TGID_OFFSET; return 0; } @@ -3074,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = { #endif }; -static int proc_tid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) { - return proc_pident_readdir(filp,dirent,filldir, - tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); + return proc_pident_readdir(file, ctx, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -3089,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den static const struct file_operations proc_tid_base_operations = { .read = generic_read_dir, - .readdir = proc_tid_base_readdir, + .iterate = proc_tid_base_readdir, .llseek = default_llseek, }; @@ -3099,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = { .setattr = proc_setattr, }; -static struct dentry *proc_task_instantiate(struct inode *dir, +static int proc_task_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; inode = proc_pid_make_inode(dir->i_sb, task); @@ -3121,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result = ERR_PTR(-ENOENT); + int result = -ENOENT; struct task_struct *task; struct task_struct *leader = get_proc_task(dir); unsigned tid; @@ -3158,7 +3071,7 @@ out_drop_task: out: put_task_struct(leader); out_no_task: - return result; + return ERR_PTR(result); } /* @@ -3230,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start) return pos; } -static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - struct task_struct *task, int tid) -{ - char name[PROC_NUMBUF]; - int len = snprintf(name, sizeof(name), "%d", tid); - return proc_fill_cache(filp, dirent, filldir, name, len, - proc_task_instantiate, task, NULL); -} - /* for the /proc/TGID/task/ directories */ -static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; struct task_struct *leader = NULL; - struct task_struct *task; - int retval = -ENOENT; - ino_t ino; - int tid; + struct task_struct *task = get_proc_task(file_inode(file)); struct pid_namespace *ns; + int tid; - task = get_proc_task(inode); if (!task) - goto out_no_task; + return -ENOENT; rcu_read_lock(); if (pid_alive(task)) { leader = task->group_leader; @@ -3262,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi rcu_read_unlock(); put_task_struct(task); if (!leader) - goto out_no_task; - retval = 0; + return -ENOENT; - switch ((unsigned long)filp->f_pos) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - /* fall through */ - } + if (!dir_emit_dots(file, ctx)) + goto out; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = filp->f_dentry->d_sb->s_fs_info; - tid = (int)filp->f_version; - filp->f_version = 0; - for (task = first_tid(leader, tid, filp->f_pos - 2, ns); + ns = file->f_dentry->d_sb->s_fs_info; + tid = (int)file->f_version; + file->f_version = 0; + for (task = first_tid(leader, tid, ctx->pos - 2, ns); task; - task = next_tid(task), filp->f_pos++) { + task = next_tid(task), ctx->pos++) { + char name[PROC_NUMBUF]; + int len; tid = task_pid_nr_ns(task, ns); - if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { + len = snprintf(name, sizeof(name), "%d", tid); + if (!proc_fill_cache(file, ctx, name, len, + proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - filp->f_version = (u64)tid; + file->f_version = (u64)tid; put_task_struct(task); break; } } out: put_task_struct(leader); -out_no_task: - return retval; + return 0; } static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) @@ -3327,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, - .readdir = proc_task_readdir, + .iterate = proc_task_readdir, .llseek = default_llseek, }; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d7a4a28..75f2890 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) return ret; } -static struct dentry * +static int proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); unsigned fd = (unsigned long)ptr; struct proc_inode *ei; struct inode *inode; @@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry, /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, instantiate_t instantiate) { struct task_struct *task = get_proc_task(dir); - struct dentry *result = ERR_PTR(-ENOENT); + int result = -ENOENT; unsigned fd = name_to_int(dentry); if (!task) @@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, out: put_task_struct(task); out_no_task: - return result; + return ERR_PTR(result); } -static int proc_readfd_common(struct file * filp, void * dirent, - filldir_t filldir, instantiate_t instantiate) +static int proc_readfd_common(struct file *file, struct dir_context *ctx, + instantiate_t instantiate) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); + struct task_struct *p = get_proc_task(file_inode(file)); struct files_struct *files; - unsigned int fd, ino; - int retval; + unsigned int fd; - retval = -ENOENT; if (!p) - goto out_no_task; - retval = 0; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - for (fd = filp->f_pos - 2; - fd < files_fdtable(files)->max_fds; - fd++, filp->f_pos++) { - char name[PROC_NUMBUF]; - int len; - int rv; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); + return -ENOENT; - len = snprintf(name, sizeof(name), "%d", fd); - rv = proc_fill_cache(filp, dirent, filldir, - name, len, instantiate, p, - (void *)(unsigned long)fd); - if (rv < 0) - goto out_fd_loop; - rcu_read_lock(); - } - rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); + if (!dir_emit_dots(file, ctx)) + goto out; + if (!dir_emit_dots(file, ctx)) + goto out; + files = get_files_struct(p); + if (!files) + goto out; + + rcu_read_lock(); + for (fd = ctx->pos - 2; + fd < files_fdtable(files)->max_fds; + fd++, ctx->pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (!proc_fill_cache(file, ctx, + name, len, instantiate, p, + (void *)(unsigned long)fd)) + goto out_fd_loop; + rcu_read_lock(); } + rcu_read_unlock(); +out_fd_loop: + put_files_struct(files); out: put_task_struct(p); -out_no_task: - return retval; + return 0; } -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +static int proc_readfd(struct file *file, struct dir_context *ctx) { - return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); + return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .readdir = proc_readfd, + .iterate = proc_readfd, .llseek = default_llseek, }; @@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = { .setattr = proc_setattr, }; -static struct dentry * +static int proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - struct dentry *error = ERR_PTR(-ENOENT); unsigned fd = (unsigned long)ptr; struct proc_inode *ei; struct inode *inode; @@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; + return -ENOENT; } static struct dentry * @@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +static int proc_readfdinfo(struct file *file, struct dir_context *ctx) { - return proc_readfd_common(filp, dirent, filldir, + return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); } @@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = { const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, - .readdir = proc_readfdinfo, + .iterate = proc_readfdinfo, .llseek = default_llseek, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index a2596af..94441a4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, - filldir_t filldir) +int proc_readdir_de(struct proc_dir_entry *de, struct file *file, + struct dir_context *ctx) { - unsigned int ino; int i; - struct inode *inode = file_inode(filp); - int ret = 0; - - ino = inode->i_ino; - i = filp->f_pos; - switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, i, - parent_ino(filp->f_path.dentry), - DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - spin_lock(&proc_subdir_lock); - de = de->subdir; - i -= 2; - for (;;) { - if (!de) { - ret = 1; - spin_unlock(&proc_subdir_lock); - goto out; - } - if (!i) - break; - de = de->next; - i--; - } - do { - struct proc_dir_entry *next; - - /* filldir passes info to user space */ - pde_get(de); - spin_unlock(&proc_subdir_lock); - if (filldir(dirent, de->name, de->namelen, filp->f_pos, - de->low_ino, de->mode >> 12) < 0) { - pde_put(de); - goto out; - } - spin_lock(&proc_subdir_lock); - filp->f_pos++; - next = de->next; - pde_put(de); - de = next; - } while (de); + if (!dir_emit_dots(file, ctx)) + return 0; + + spin_lock(&proc_subdir_lock); + de = de->subdir; + i = ctx->pos - 2; + for (;;) { + if (!de) { spin_unlock(&proc_subdir_lock); + return 0; + } + if (!i) + break; + de = de->next; + i--; } - ret = 1; -out: - return ret; + + do { + struct proc_dir_entry *next; + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (!dir_emit(ctx, de->name, de->namelen, + de->low_ino, de->mode >> 12)) { + pde_put(de); + return 0; + } + spin_lock(&proc_subdir_lock); + ctx->pos++; + next = de->next; + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + return 0; } -int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +int proc_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), filp, dirent, filldir); + return proc_readdir_de(PDE(inode), file, ctx); } /* @@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) static const struct file_operations proc_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = proc_readdir, + .iterate = proc_readdir, }; /* diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d600fb0..651d09a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); extern int pid_revalidate(struct dentry *, unsigned int); extern int pid_delete_dentry(const struct dentry *); -extern int proc_pid_readdir(struct file *, void *, filldir_t); +extern int proc_pid_readdir(struct file *, struct dir_context *); extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, +typedef int instantiate_t(struct inode *, struct dentry *, struct task_struct *, const void *); -extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, +extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, instantiate_t, struct task_struct *, const void *); /* @@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock; extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, struct dentry *); -extern int proc_readdir(struct file *, void *, filldir_t); -extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); +extern int proc_readdir(struct file *, struct dir_context *); +extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 0a22194..06ea155 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) prpsinfo.pr_zomb = 0; strcpy(prpsinfo.pr_fname, "vmlinux"); - strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); + strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); nhdr->p_filesz += notesize(¬es[1]); bufp = storenote(¬es[1], bufp); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a7..bdfabda 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 54bdc67..49a7fff 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = { .setattr = proc_setattr, }; -static struct dentry *proc_ns_instantiate(struct inode *dir, +static int proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) @@ -208,90 +207,52 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, 0)) - error = NULL; + return 0; out: - return error; -} - -static int proc_ns_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, - const struct proc_ns_operations *ops) -{ - return proc_fill_cache(filp, dirent, filldir, - ops->name, strlen(ops->name), - proc_ns_instantiate, task, ops); + return -ENOENT; } -static int proc_ns_dir_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { - int i; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(file_inode(file)); const struct proc_ns_operations **entry, **last; - ino_t ino; - int ret; - ret = -ENOENT; if (!task) - goto out_no_task; + return -ENOENT; - ret = 0; - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= ARRAY_SIZE(ns_entries)) { - ret = 1; - goto out; - } - entry = ns_entries + i; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - while (entry <= last) { - if (proc_ns_fill_cache(filp, dirent, filldir, - task, *entry) < 0) - goto out; - filp->f_pos++; - entry++; - } + if (!dir_emit_dots(file, ctx)) + goto out; + if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) + goto out; + entry = ns_entries + (ctx->pos - 2); + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + const struct proc_ns_operations *ops = *entry; + if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops)) + break; + ctx->pos++; + entry++; } - - ret = 1; out: put_task_struct(task); -out_no_task: - return ret; + return 0; } const struct file_operations proc_ns_dir_operations = { .read = generic_read_dir, - .readdir = proc_ns_dir_readdir, + .iterate = proc_ns_dir_readdir, }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *error; + int error; struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; - error = ERR_PTR(-ENOENT); + error = -ENOENT; if (!task) goto out_no_task; @@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, out: put_task_struct(task); out_no_task: - return error; + return ERR_PTR(error); } const struct inode_operations proc_ns_dir_inode_operations = { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 986e832..4677bb7 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = { .getattr = proc_tgid_net_getattr, }; -static int proc_tgid_net_readdir(struct file *filp, void *dirent, - filldir_t filldir) +static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; - net = get_proc_task_net(file_inode(filp)); + net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + ret = proc_readdir_de(net->proc_net, file, ctx); put_net(net); } return ret; @@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = proc_tgid_net_readdir, + .iterate = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ac05f33..7129046 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -573,12 +573,12 @@ out: return ret; } -static int proc_sys_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, +static bool proc_sys_fill_cache(struct file *file, + struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { - struct dentry *child, *dir = filp->f_path.dentry; + struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; @@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, inode = proc_sys_make_inode(dir->d_sb, head, table); if (!inode) { dput(child); - return -ENOMEM; + return false; } else { d_set_d_op(child, &proc_sys_dentry_operations); d_add(child, inode); } } else { - return -ENOMEM; + return false; } } inode = child->d_inode; ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); - return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); + return dir_emit(ctx, qname.name, qname.len, ino, type); } -static int proc_sys_link_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, +static bool proc_sys_link_fill_cache(struct file *file, + struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { - int err, ret = 0; + bool ret = true; head = sysctl_head_grab(head); if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ - err = sysctl_follow_link(&head, &table, current->nsproxy); + int err = sysctl_follow_link(&head, &table, current->nsproxy); if (err) goto out; } - ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); + ret = proc_sys_fill_cache(file, ctx, head, table); out: sysctl_head_finish(head); return ret; @@ -634,67 +634,50 @@ out: static int scan(struct ctl_table_header *head, ctl_table *table, unsigned long *pos, struct file *file, - void *dirent, filldir_t filldir) + struct dir_context *ctx) { - int res; + bool res; - if ((*pos)++ < file->f_pos) - return 0; + if ((*pos)++ < ctx->pos) + return true; if (unlikely(S_ISLNK(table->mode))) - res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); + res = proc_sys_link_fill_cache(file, ctx, head, table); else - res = proc_sys_fill_cache(file, dirent, filldir, head, table); + res = proc_sys_fill_cache(file, ctx, head, table); - if (res == 0) - file->f_pos = *pos; + if (res) + ctx->pos = *pos; return res; } -static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct ctl_table_header *head = grab_header(inode); + struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; - int ret = -EINVAL; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); - ret = 0; - /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, - inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - } - if (filp->f_pos == 1) { - if (filldir(dirent, "..", 2, filp->f_pos, - parent_ino(dentry), DT_DIR) < 0) - goto out; - filp->f_pos++; - } + if (!dir_emit_dots(file, ctx)) + return 0; + pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { - ret = scan(h, entry, &pos, filp, dirent, filldir); - if (ret) { + if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } - ret = 1; -out: sysctl_head_finish(head); - return ret; + return 0; } static int proc_sys_permission(struct inode *inode, int mask) @@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = { static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, - .readdir = proc_sys_readdir, + .iterate = proc_sys_readdir, .llseek = generic_file_llseek, }; @@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p) return res; } -static int proc_sys_compare(const struct dentry *parent, - const struct inode *pinode, - const struct dentry *dentry, const struct inode *inode, +static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; + struct inode *inode; + /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ + inode = ACCESS_ONCE(dentry->d_inode); if (!inode) return 1; if (name->len != len) diff --git a/fs/proc/root.c b/fs/proc/root.c index 41a6ea9..229e366 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr return proc_pid_lookup(dir, dentry, flags); } -static int proc_root_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int proc_root_readdir(struct file *file, struct dir_context *ctx) { - unsigned int nr = filp->f_pos; - int ret; - - if (nr < FIRST_PROCESS_ENTRY) { - int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) - return error; - filp->f_pos = FIRST_PROCESS_ENTRY; + if (ctx->pos < FIRST_PROCESS_ENTRY) { + proc_readdir(file, ctx); + ctx->pos = FIRST_PROCESS_ENTRY; } - ret = proc_pid_readdir(filp, dirent, filldir); - return ret; + return proc_pid_readdir(file, ctx); } /* @@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp, */ static const struct file_operations proc_root_operations = { .read = generic_read_dir, - .readdir = proc_root_readdir, + .iterate = proc_root_readdir, .llseek = default_llseek, }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d8..dbf61f6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_LAST, +}; + +struct clear_refs_private { + struct vm_area_struct *vma; + enum clear_refs_types type; +}; + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + .type = type, + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* @@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -794,6 +861,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -807,14 +875,17 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, - pmd_t pmd, int offset) +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1110,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 9610ac7..0618946 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) for_each_possible_cpu(i) idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; - do_posix_clock_monotonic_gettime(&uptime); - monotonic_to_bootbased(&uptime); + get_monotonic_boottime(&uptime); nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 17f7e08..2850317 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include <asm/io.h> #include "internal.h" @@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list); /* Stores the pointer to the buffer containing kernel elf core headers. */ static char *elfcorebuf; static size_t elfcorebuf_sz; +static size_t elfcorebuf_sz_orig; + +static char *elfnotes_buf; +static size_t elfnotes_sz; /* Total size of vmcore file. */ static u64 vmcore_size; @@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } -/* Maps vmcore file offset to respective physical address in memroy. */ -static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, - struct vmcore **m_ptr) -{ - struct vmcore *m; - u64 paddr; - - list_for_each_entry(m, vc_list, list) { - u64 start, end; - start = m->offset; - end = m->offset + m->size - 1; - if (offset >= start && offset <= end) { - paddr = m->paddr + offset - start; - *m_ptr = m; - return paddr; - } - } - *m_ptr = NULL; - return 0; -} - /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, { ssize_t acc = 0, tmp; size_t tsz; - u64 start, nr_bytes; - struct vmcore *curr_m = NULL; + u64 start; + struct vmcore *m = NULL; if (buflen == 0 || *fpos >= vmcore_size) return 0; @@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = elfcorebuf_sz - *fpos; - if (buflen < tsz) - tsz = buflen; + tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) return -EFAULT; buflen -= tsz; @@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } - start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); - if (!curr_m) - return -EINVAL; - - while (buflen) { - tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK)); + /* Read Elf note segment */ + if (*fpos < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; - - tmp = read_from_oldmem(buffer, tsz, &start, 1); - if (tmp < 0) - return tmp; + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + if (copy_to_user(buffer, kaddr, tsz)) + return -EFAULT; buflen -= tsz; *fpos += tsz; buffer += tsz; acc += tsz; - if (start >= (curr_m->paddr + curr_m->size)) { - if (curr_m->list.next == &vmcore_list) - return acc; /*EOF*/ - curr_m = list_entry(curr_m->list.next, - struct vmcore, list); - start = curr_m->paddr; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (*fpos < m->offset + m->size) { + tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); + start = m->paddr + *fpos - m->offset; + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; } } + return acc; } +/** + * alloc_elfnotes_buf - allocate buffer for ELF note segment in + * vmalloc memory + * + * @notes_sz: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *alloc_elfnotes_buf(size_t notes_sz) +{ +#ifdef CONFIG_MMU + return vmalloc_user(notes_sz); +#else + return vzalloc(notes_sz); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, len, tsz; + struct vmcore *m; + + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + end = start + size; + + if (size > vmcore_size || end > vmcore_size) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + + len = 0; + + if (start < elfcorebuf_sz) { + u64 pfn; + + tsz = min(elfcorebuf_sz - (size_t)start, size); + pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, pfn, tsz, + vma->vm_page_prot)) + return -EAGAIN; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + if (start < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); + kaddr = elfnotes_buf + start - elfcorebuf_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, + kaddr, tsz)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (start < m->offset + m->size) { + u64 paddr = 0; + + tsz = min_t(size_t, m->offset + m->size - start, size); + paddr = m->paddr + start - m->offset; + if (remap_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + } + + return 0; +fail: + do_munmap(vma->vm_mm, vma->vm_start, len); + return -EAGAIN; +} +#else +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +#endif + static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, .llseek = default_llseek, + .mmap = mmap_vmcore, }; static struct vmcore* __init get_new_element(void) @@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size_elf64(char *elfptr) +static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { - int i; - u64 size; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr *phdr_ptr; - - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); - size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; - } - return size; -} - -static u64 __init get_vmcore_size_elf32(char *elfptr) -{ - int i; u64 size; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr *phdr_ptr; + struct vmcore *m; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); - size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; + size = elfsz + elfnotesegsz; + list_for_each_entry(m, vc_list, list) { + size += m->size; } return size; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf64_Phdr *phdr_ptr; Elf64_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); @@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { + while (real_sz < max_sz) { if (nhdr_ptr->n_namesz == 0) break; sz = sizeof(Elf64_Nhdr) + @@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + } + + return 0; +} + +/** + * get_note_number_and_size_elf64 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf64_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf64 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + + phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; } + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + rc = update_note_header_size_elf64(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf64_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return 0; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf32_Phdr *phdr_ptr; Elf32_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); @@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { + while (real_sz < max_sz) { if (nhdr_ptr->n_namesz == 0) break; sz = sizeof(Elf32_Nhdr) + @@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + } + + return 0; +} + +/** + * get_note_number_and_size_elf32 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf32_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf32 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + + phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; } + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + rc = update_note_header_size_elf32(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = alloc_elfnotes_buf(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf32_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, * the new offset fields of exported program headers. */ static int __init process_ptload_program_headers_elf64(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset. */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } static int __init process_ptload_program_headers_elf32(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf64(char *elfptr, - struct list_head *vc_list) +static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; - Elf64_Ehdr *ehdr_ptr; struct vmcore *m; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { m->offset = vmcore_off; @@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr, } } -/* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf32(char *elfptr, - struct list_head *vc_list) +static void free_elfcorebuf(void) { - loff_t vmcore_off; - Elf32_Ehdr *ehdr_ptr; - struct vmcore *m; - - ehdr_ptr = (Elf32_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); - - list_for_each_entry(m, vc_list, list) { - m->offset = vmcore_off; - vmcore_off += m->size; - } + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = NULL; + vfree(elfnotes_buf); + elfnotes_buf = NULL; } static int __init parse_crash_elf64_headers(void) @@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void) } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) + + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, - &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf32_headers(void) @@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void) } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); + if (rc) + goto fail; rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, - &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + elfnotes_sz, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf_headers(void) @@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void) rc = parse_crash_elf64_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf64(elfcorebuf); } else if (e_ident[EI_CLASS] == ELFCLASS32) { rc = parse_crash_elf32_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf32(elfcorebuf); } else { pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + return 0; } @@ -711,7 +986,6 @@ void vmcore_cleanup(void) list_del(&m->list); kfree(m); } - kfree(elfcorebuf); - elfcorebuf = NULL; + free_elfcorebuf(); } EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 43b1280..76a4eeb 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - sizeof(rec), psinfo); + 0, sizeof(rec), psinfo); local_irq_restore(flags); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index e4bcb2c..71bf5f4 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (p->psi->erase) p->psi->erase(p->type, p->id, p->count, dentry->d_inode->i_ctime, p->psi); + else + return -EPERM; return simple_unlink(dir, dentry); } @@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_MCE: sprintf(name, "mce-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_RTAS: + sprintf(name, "rtas-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_OF: + sprintf(name, "powerpc-ofw-%s-%lld", psname, id); + break; + case PSTORE_TYPE_PPC_COMMON: + sprintf(name, "powerpc-common-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: sprintf(name, "unknown-%s-%lld", psname, id); break; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 86d1038..422962a 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, break; ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, hsize + len, psinfo); + oopscount, hsize, hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; @@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) spin_lock_irqsave(&psinfo->buf_lock, flags); } memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); + psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -221,9 +221,11 @@ static void pstore_register_console(void) {} static int pstore_write_compat(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, - size_t size, struct pstore_info *psi) + size_t hsize, size_t size, + struct pstore_info *psi) { - return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); + return psi->write_buf(type, reason, id, part, psinfo->buf, hsize, + size, psi); } /* @@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi) { struct module *owner = psi->owner; + if (backend && strcmp(backend, psi->name)) + return -EPERM; + spin_lock(&pstore_lock); if (psinfo) { spin_unlock(&pstore_lock); return -EBUSY; } - if (backend && strcmp(backend, psi->name)) { - spin_unlock(&pstore_lock); - return -EINVAL; - } - if (!psi->write) psi->write = pstore_write_compat; psinfo = psi; @@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi) add_timer(&pstore_timer); } + pr_info("pstore: Registered %s as persistent store backend\n", + psi->name); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 1376e5a..a6119f9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, - const char *buf, size_t size, + const char *buf, + size_t hsize, size_t size, struct pstore_info *psi) { struct ramoops_context *cxt = psi->data; @@ -399,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - if (!is_power_of_2(pdata->mem_size)) - pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); if (!is_power_of_2(pdata->record_size)) pdata->record_size = rounddown_pow_of_two(pdata->record_size); if (!is_power_of_2(pdata->console_size)) diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 5933732..de272d4 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) } /* increase and wrap the start pointer, returning the old value */ -static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) +static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) { int old; int new; @@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) } /* increase the size counter until it hits the max size */ -static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) +static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; @@ -78,6 +78,53 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); } +static DEFINE_RAW_SPINLOCK(buffer_lock); + +/* increase and wrap the start pointer, returning the old value */ +static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) +{ + int old; + int new; + unsigned long flags; + + raw_spin_lock_irqsave(&buffer_lock, flags); + + old = atomic_read(&prz->buffer->start); + new = old + a; + while (unlikely(new > prz->buffer_size)) + new -= prz->buffer_size; + atomic_set(&prz->buffer->start, new); + + raw_spin_unlock_irqrestore(&buffer_lock, flags); + + return old; +} + +/* increase the size counter until it hits the max size */ +static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a) +{ + size_t old; + size_t new; + unsigned long flags; + + raw_spin_lock_irqsave(&buffer_lock, flags); + + old = atomic_read(&prz->buffer->size); + if (old == prz->buffer_size) + goto exit; + + new = old + a; + if (new > prz->buffer_size) + new = prz->buffer_size; + atomic_set(&prz->buffer->size, new); + +exit: + raw_spin_unlock_irqrestore(&buffer_lock, flags); +} + +static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic; +static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic; + static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { @@ -372,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size) return NULL; } + buffer_start_add = buffer_start_add_locked; + buffer_size_add = buffer_size_add_locked; + return ioremap(start, size); } diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index 28ce014..b218f96 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -14,9 +14,9 @@ #include <linux/buffer_head.h> #include "qnx4.h" -static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int qnx4_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); unsigned int offset; struct buffer_head *bh; struct qnx4_inode_entry *de; @@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) int size; QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); - QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); + QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos)); - while (filp->f_pos < inode->i_size) { - blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); + while (ctx->pos < inode->i_size) { + blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS); bh = sb_bread(inode->i_sb, blknum); - if(bh==NULL) { + if (bh == NULL) { printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); - break; + return 0; } - ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; - while (ix < QNX4_INODES_PER_BLOCK) { + ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; + for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { offset = ix * QNX4_DIR_ENTRY_SIZE; de = (struct qnx4_inode_entry *) (bh->b_data + offset); - size = strlen(de->di_fname); - if (size) { - if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) - size = QNX4_SHORT_NAME_MAX; - else if ( size > QNX4_NAME_MAX ) - size = QNX4_NAME_MAX; - - if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { - QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); - if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) - ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; - else { - le = (struct qnx4_link_info*)de; - ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * - QNX4_INODES_PER_BLOCK + - le->dl_inode_ndx; - } - if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { - brelse(bh); - goto out; - } - } + if (!de->di_fname[0]) + continue; + if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + continue; + if (!(de->di_status & QNX4_FILE_LINK)) + size = QNX4_SHORT_NAME_MAX; + else + size = QNX4_NAME_MAX; + size = strnlen(de->di_fname, size); + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); + if (!(de->di_status & QNX4_FILE_LINK)) + ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; + else { + le = (struct qnx4_link_info*)de; + ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + QNX4_INODES_PER_BLOCK + + le->dl_inode_ndx; + } + if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) { + brelse(bh); + return 0; } - ix++; - filp->f_pos += QNX4_DIR_ENTRY_SIZE; } brelse(bh); } -out: return 0; } @@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = qnx4_readdir, + .iterate = qnx4_readdir, .fsync = generic_file_fsync, }; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8798d06..15b7d92 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, static int qnx6_dir_longfilename(struct inode *inode, struct qnx6_long_dir_entry *de, - void *dirent, loff_t pos, - unsigned de_inode, filldir_t filldir) + struct dir_context *ctx, + unsigned de_inode) { struct qnx6_long_filename *lf; struct super_block *s = inode->i_sb; @@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode, QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", lf_size, lf->lf_fname, de_inode)); - if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, - DT_UNKNOWN) < 0) { + if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { qnx6_put_page(page); return 0; } @@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode, return 1; } -static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int qnx6_readdir(struct file *file, struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); + loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; bool done = false; - if (filp->f_pos >= inode->i_size) + ctx->pos = pos; + if (ctx->pos >= inode->i_size) return 0; for ( ; !done && n < npages; n++, start = 0) { @@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) if (IS_ERR(page)) { printk(KERN_ERR "qnx6_readdir: read failed\n"); - filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; + ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; return PTR_ERR(page); } de = ((struct qnx6_dir_entry *)page_address(page)) + start; - for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { + for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { int size = de->de_size; u32 no_inode = fs32_to_cpu(sbi, de->de_inode); @@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) structure / block */ if (!qnx6_dir_longfilename(inode, (struct qnx6_long_dir_entry *)de, - dirent, pos, no_inode, - filldir)) { + ctx, no_inode)) { done = true; break; } @@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" " inode:%u\n", size, de->de_fname, no_inode)); - if (filldir(dirent, de->de_fname, size, - pos, no_inode, DT_UNKNOWN) - < 0) { + if (!dir_emit(ctx, de->de_fname, size, + no_inode, DT_UNKNOWN)) { done = true; break; } @@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) } qnx6_put_page(page); } - filp->f_pos = pos; return 0; } @@ -282,7 +279,7 @@ found: const struct file_operations qnx6_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = qnx6_readdir, + .iterate = qnx6_readdir, .fsync = generic_file_fsync, }; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3e64169..fbad622 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write, return proc_dointvec(table, write, buffer, lenp, ppos); } -static ctl_table fs_dqstats_table[] = { +static struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], @@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = { { }, }; -static ctl_table fs_table[] = { +static struct ctl_table fs_table[] = { { .procname = "quota", .mode = 0555, @@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = { { }, }; -static ctl_table sys_table[] = { +static struct ctl_table sys_table[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/read_write.c b/fs/read_write.c index 0343000..122a384 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -41,8 +41,19 @@ static inline int unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } -static loff_t lseek_execute(struct file *file, struct inode *inode, - loff_t offset, loff_t maxsize) +/** + * vfs_setpos - update the file offset for lseek + * @file: file structure in question + * @offset: file offset to seek to + * @maxsize: maximum file size + * + * This is a low-level filesystem helper for updating the file offset to + * the value specified by @offset if the given offset is valid and it is + * not equal to the current file offset. + * + * Return the specified offset on success and -EINVAL on invalid offset. + */ +loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; @@ -55,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode, } return offset; } +EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files @@ -76,8 +88,6 @@ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { - struct inode *inode = file->f_mapping->host; - switch (whence) { case SEEK_END: offset += eof; @@ -97,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * like SEEK_SET. */ spin_lock(&file->f_lock); - offset = lseek_execute(file, inode, file->f_pos + offset, - maxsize); + offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: @@ -120,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, break; } - return lseek_execute(file, inode, offset, maxsize); + return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); @@ -145,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) EXPORT_SYMBOL(generic_file_llseek); /** + * fixed_size_llseek - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: size of the file + * + */ +loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + size, size); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(fixed_size_llseek); + +/** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to @@ -296,7 +325,7 @@ out_putf: * them to something that fits in "int" so that others * won't have to do range checks all the time. */ -int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) +int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { struct inode *inode; loff_t pos; @@ -477,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_read(f.file, buf, count, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } return ret; @@ -492,7 +522,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_write(f.file, buf, count, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } @@ -780,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_readv(f.file, vec, vlen, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } @@ -799,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, if (f.file) { loff_t pos = file_pos_read(f.file); ret = vfs_writev(f.file, vec, vlen, &pos); - file_pos_write(f.file, pos); + if (ret >= 0) + file_pos_write(f.file, pos); fdput(f); } @@ -959,7 +992,8 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, return -EBADF; pos = f.file->f_pos; ret = compat_readv(f.file, vec, vlen, &pos); - f.file->f_pos = pos; + if (ret >= 0) + f.file->f_pos = pos; fdput(f); return ret; } @@ -1025,7 +1059,8 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, return -EBADF; pos = f.file->f_pos; ret = compat_writev(f.file, vec, vlen, &pos); - f.file->f_pos = pos; + if (ret >= 0) + f.file->f_pos = pos; fdput(f); return ret; } @@ -1064,6 +1099,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, struct fd in, out; struct inode *in_inode, *out_inode; loff_t pos; + loff_t out_pos; ssize_t retval; int fl; @@ -1077,12 +1113,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; - if (!ppos) - ppos = &in.file->f_pos; - else + if (!ppos) { + pos = in.file->f_pos; + } else { + pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; - retval = rw_verify_area(READ, in.file, ppos, count); + } + retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; count = retval; @@ -1099,7 +1137,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); - retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); + out_pos = out.file->f_pos; + retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; count = retval; @@ -1107,7 +1146,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); - pos = *ppos; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) @@ -1126,18 +1164,25 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in.file, ppos, out.file, count, fl); + file_start_write(out.file); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); + file_end_write(out.file); if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); + out.file->f_pos = out_pos; + if (ppos) + *ppos = pos; + else + in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); - if (*ppos > max) + if (pos > max) retval = -EOVERFLOW; fput_out: diff --git a/fs/readdir.c b/fs/readdir.c index fee38e0..93d71e5 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -20,11 +20,11 @@ #include <asm/uaccess.h> -int vfs_readdir(struct file *file, filldir_t filler, void *buf) +int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); int res = -ENOTDIR; - if (!file->f_op || !file->f_op->readdir) + if (!file->f_op || !file->f_op->iterate) goto out; res = security_file_permission(file, MAY_READ); @@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) res = -ENOENT; if (!IS_DEADDIR(inode)) { - res = file->f_op->readdir(file, buf, filler); + ctx->pos = file->f_pos; + res = file->f_op->iterate(file, ctx); + file->f_pos = ctx->pos; file_accessed(file); } mutex_unlock(&inode->i_mutex); out: return res; } - -EXPORT_SYMBOL(vfs_readdir); +EXPORT_SYMBOL(iterate_dir); /* * Traditional linux readdir() handling.. @@ -66,6 +67,7 @@ struct old_linux_dirent { }; struct readdir_callback { + struct dir_context ctx; struct old_linux_dirent __user * dirent; int result; }; @@ -73,7 +75,7 @@ struct readdir_callback { static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct readdir_callback * buf = (struct readdir_callback *) __buf; + struct readdir_callback *buf = (struct readdir_callback *) __buf; struct old_linux_dirent __user * dirent; unsigned long d_ino; @@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, { int error; struct fd f = fdget(fd); - struct readdir_callback buf; + struct readdir_callback buf = { + .ctx.actor = fillonedir, + .dirent = dirent + }; if (!f.file) return -EBADF; - buf.result = 0; - buf.dirent = dirent; - - error = vfs_readdir(f.file, fillonedir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (buf.result) error = buf.result; @@ -137,6 +139,7 @@ struct linux_dirent { }; struct getdents_callback { + struct dir_context ctx; struct linux_dirent __user * current_dir; struct linux_dirent __user * previous; int count; @@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, { struct fd f; struct linux_dirent __user * lastdirent; - struct getdents_callback buf; + struct getdents_callback buf = { + .ctx.actor = filldir, + .count = count, + .current_dir = dirent + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, filldir, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - if (put_user(f.file->f_pos, &lastdirent->d_off)) + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, } struct getdents_callback64 { + struct dir_context ctx; struct linux_dirent64 __user * current_dir; struct linux_dirent64 __user * previous; int count; @@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, { struct fd f; struct linux_dirent64 __user * lastdirent; - struct getdents_callback64 buf; + struct getdents_callback64 buf = { + .ctx.actor = filldir64, + .count = count, + .current_dir = dirent + }; int error; if (!access_ok(VERIFY_WRITE, dirent, count)) @@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, if (!f.file) return -EBADF; - buf.current_dir = dirent; - buf.previous = NULL; - buf.count = count; - buf.error = 0; - - error = vfs_readdir(f.file, filldir64, &buf); + error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; lastdirent = buf.previous; if (lastdirent) { - typeof(lastdirent->d_off) d_off = f.file->f_pos; + typeof(lastdirent->d_off) d_off = buf.ctx.pos; if (__put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 66c53b6..03e4ca5 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -13,14 +13,14 @@ extern const struct reiserfs_key MIN_KEY; -static int reiserfs_readdir(struct file *, void *, filldir_t); +static int reiserfs_readdir(struct file *, struct dir_context *); static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, int datasync); const struct file_operations reiserfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = reiserfs_readdir, + .iterate = reiserfs_readdir, .fsync = reiserfs_dir_fsync, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT @@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, #define store_ih(where,what) copy_item_head (where, what) -static inline bool is_privroot_deh(struct dentry *dir, - struct reiserfs_de_head *deh) +static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) { - struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - return (dir == dir->d_parent && privroot->d_inode && + struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; + return (privroot->d_inode && deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); } -int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, - filldir_t filldir, loff_t *pos) +int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) { - struct inode *inode = dentry->d_inode; struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; @@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, /* form key for search the next directory entry using f_pos field of file structure */ - make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); + make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); next_pos = cpu_key_k_offset(&pos_key); path_to_entry.reada = PATH_READA; @@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, entry_num++, deh++) { int d_reclen; char *d_name; - off_t d_off; ino_t d_ino; if (!de_visible(deh)) @@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, } /* Ignore the .reiserfs_priv entry */ - if (is_privroot_deh(dentry, deh)) + if (is_privroot_deh(inode, deh)) continue; - d_off = deh_offset(deh); - *pos = d_off; + ctx->pos = deh_offset(deh); d_ino = deh_objectid(deh); if (d_reclen <= 32) { local_buf = small_buf; @@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, * the write lock here for other waiters */ reiserfs_write_unlock(inode->i_sb); - if (filldir - (dirent, local_buf, d_reclen, d_off, d_ino, - DT_UNKNOWN) < 0) { + if (!dir_emit + (ctx, local_buf, d_reclen, d_ino, + DT_UNKNOWN)) { reiserfs_write_lock(inode->i_sb); if (local_buf != small_buf) { kfree(local_buf); @@ -204,6 +199,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, next_pos = deh_offset(deh) + 1; if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); goto research; } } /* for */ @@ -235,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, } /* while */ end: - *pos = next_pos; + ctx->pos = next_pos; pathrelse(&path_to_entry); reiserfs_check_path(&path_to_entry); out: @@ -243,10 +240,9 @@ out: return ret; } -static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int reiserfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = file->f_path.dentry; - return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos); + return reiserfs_readdir_inode(file_inode(file), ctx); } /* compose directory item containing "." and ".." entries (entries are diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77d6d47..0048cc1 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - if (insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args) < 0) { + + reiserfs_write_unlock(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock(inode->i_sb); + if (err) { err = -EINVAL; goto out_bad_inode; } + if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. @@ -2970,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) } /* clm -- taken from fs/buffer.c:block_invalidate_page */ -static void reiserfs_invalidatepage(struct page *page, unsigned long offset) +static void reiserfs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; struct inode *inode = page->mapping->host; unsigned int curr_off = 0; + unsigned int stop = offset + length; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int ret = 1; BUG_ON(!PageLocked(page)); - if (offset == 0) + if (!partial_page) ClearPageChecked(page); if (!page_has_buffers(page)) @@ -2991,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + goto out; + /* * is this block fully invalidated? */ @@ -3009,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset) * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ - if (!offset && ret) { + if (!partial_page && ret) { ret = try_to_release_page(page, 0); /* maybe should BUG_ON(!ret); - neilb */ } diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 157e474..3df5ce6 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations; extern const struct inode_operations reiserfs_symlink_inode_operations; extern const struct inode_operations reiserfs_special_inode_operations; extern const struct file_operations reiserfs_dir_operations; -int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *); +int reiserfs_readdir_inode(struct inode *, struct dir_context *); /* tail_conversion.c */ int direct2indirect(struct reiserfs_transaction_handle *, struct inode *, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4cce1d9..c69cdd7 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) * modifying extended attributes. This includes operations such as permissions * or ownership changes, object deletions, etc. */ struct reiserfs_dentry_buf { + struct dir_context ctx; struct dentry *xadir; int count; struct dentry *dentries[8]; @@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode, { struct dentry *dir; int i, err = 0; - loff_t pos = 0; struct reiserfs_dentry_buf buf = { - .count = 0, + .ctx.actor = fill_with_dentries, }; /* Skip out, an xattr has no xattrs associated with it */ @@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode, reiserfs_write_lock(inode->i_sb); buf.xadir = dir; - err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); - while ((err == 0 || err == -ENOSPC) && buf.count) { - err = 0; - - for (i = 0; i < buf.count && buf.dentries[i]; i++) { - int lerr = 0; + while (1) { + err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); + if (err) + break; + if (!buf.count) + break; + for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { struct dentry *dentry = buf.dentries[i]; - if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode)) - lerr = action(dentry, data); + if (!S_ISDIR(dentry->d_inode->i_mode)) + err = action(dentry, data); dput(dentry); buf.dentries[i] = NULL; - err = lerr ?: err; } + if (err) + break; buf.count = 0; - if (!err) - err = reiserfs_readdir_dentry(dir, &buf, - fill_with_dentries, &pos); } mutex_unlock(&dir->d_inode->i_mutex); - /* Clean up after a failed readdir */ cleanup_dentry_buf(&buf); if (!err) { @@ -318,7 +316,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data) static int chown_one_xattr(struct dentry *dentry, void *data) { struct iattr *attrs = data; - return reiserfs_setattr(dentry, attrs); + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; } /* No i_mutex, but the inode is unconnected. */ @@ -788,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) } struct listxattr_buf { + struct dir_context ctx; size_t size; size_t pos; char *buf; @@ -833,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { struct dentry *dir; int err = 0; - loff_t pos = 0; struct listxattr_buf buf = { + .ctx.actor = listxattr_filler, .dentry = dentry, .buf = buffer, .size = buffer ? size : 0, @@ -856,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos); + err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); mutex_unlock(&dir->d_inode->i_mutex); if (!err) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d7c01ef..6c8767f 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode) int depth; int error; + if (IS_PRIVATE(inode)) + return 0; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 15cbc41e..ff1d3d4 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = { /* * read the entries from a directory */ -static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int romfs_readdir(struct file *file, struct dir_context *ctx) { - struct inode *i = file_inode(filp); + struct inode *i = file_inode(file); struct romfs_inode ri; unsigned long offset, maxoff; int j, ino, nextfh; - int stored = 0; char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ int ret; maxoff = romfs_maxsize(i->i_sb); - offset = filp->f_pos; + offset = ctx->pos; if (!offset) { offset = i->i_ino & ROMFH_MASK; ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); @@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) for (;;) { if (!offset || offset >= maxoff) { offset = maxoff; - filp->f_pos = offset; + ctx->pos = offset; goto out; } - filp->f_pos = offset; + ctx->pos = offset; /* Fetch inode info */ ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); @@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) nextfh = be32_to_cpu(ri.next); if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) ino = be32_to_cpu(ri.spec); - if (filldir(dirent, fsname, j, offset, ino, - romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + if (!dir_emit(ctx, fsname, j, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE])) goto out; - stored++; offset = nextfh & ROMFH_MASK; } - out: - return stored; + return 0; } /* @@ -281,7 +278,7 @@ error: static const struct file_operations romfs_dir_operations = { .read = generic_read_dir, - .readdir = romfs_readdir, + .iterate = romfs_readdir, .llseek = default_llseek, }; diff --git a/fs/select.c b/fs/select.c index 8c1c96c..f9f49c4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -27,6 +27,8 @@ #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/sched/rt.h> +#include <linux/freezer.h> +#include <net/ll_poll.h> #include <asm/uaccess.h> @@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, set_current_state(state); if (!pwq->triggered) - rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + rc = freezable_schedule_hrtimeout_range(expires, slack, + HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* @@ -384,9 +387,10 @@ get_max: #define POLLEX_SET (POLLPRI) static inline void wait_key_set(poll_table *wait, unsigned long in, - unsigned long out, unsigned long bit) + unsigned long out, unsigned long bit, + unsigned int ll_flag) { - wait->_key = POLLEX_SET; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) @@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + unsigned long busy_end = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) f_op = f.file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { - wait_key_set(wait, in, out, bit); + wait_key_set(wait, in, out, + bit, busy_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } } if (res_in) @@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to @@ -717,7 +748,9 @@ struct poll_list { * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ -static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, + bool *can_busy_poll, + unsigned int busy_flag) { unsigned int mask; int fd; @@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); + if (mask & busy_flag) + *can_busy_poll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + unsigned long busy_end = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; + bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. */ - if (do_pollfd(pfd, pt)) { + if (do_pollfd(pfd, pt, &can_busy_loop, + busy_flag)) { count++; pt->_qproc = NULL; + /* found something, stop busy polling */ + busy_flag = 0; + can_busy_loop = false; } } } @@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (can_busy_loop && !need_resched()) { + if (!busy_end) { + busy_end = busy_loop_end_time(); + continue; + } + if (!busy_loop_timeout(busy_end)) + continue; + } + busy_flag = 0; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to diff --git a/fs/seq_file.c b/fs/seq_file.c index 774c1eb..3135c25 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v, return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); + +/** + * seq_hlist_start_precpu - start an iteration of a percpu hlist array + * @head: pointer to percpu array of struct hlist_heads + * @cpu: pointer to cpu "cursor" + * @pos: start position of sequence + * + * Called at seq_file->op->start(). + */ +struct hlist_node * +seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) +{ + struct hlist_node *node; + + for_each_possible_cpu(*cpu) { + hlist_for_each(node, per_cpu_ptr(head, *cpu)) { + if (pos-- == 0) + return node; + } + } + return NULL; +} +EXPORT_SYMBOL(seq_hlist_start_percpu); + +/** + * seq_hlist_next_percpu - move to the next position of the percpu hlist array + * @v: pointer to current hlist_node + * @head: pointer to percpu array of struct hlist_heads + * @cpu: pointer to cpu "cursor" + * @pos: start position of sequence + * + * Called at seq_file->op->next(). + */ +struct hlist_node * +seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, + int *cpu, loff_t *pos) +{ + struct hlist_node *node = v; + + ++*pos; + + if (node->next) + return node->next; + + for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids; + *cpu = cpumask_next(*cpu, cpu_possible_mask)) { + struct hlist_head *bucket = per_cpu_ptr(head, *cpu); + + if (!hlist_empty(bucket)) + return bucket->first; + } + return NULL; +} +EXPORT_SYMBOL(seq_hlist_next_percpu); diff --git a/fs/splice.c b/fs/splice.c index e6b2559..3b7ee65 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1098,27 +1098,13 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); - int ret; - - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - - if (unlikely(out->f_flags & O_APPEND)) - return -EINVAL; - - ret = rw_verify_area(WRITE, out, ppos, len); - if (unlikely(ret < 0)) - return ret; if (out->f_op && out->f_op->splice_write) splice_write = out->f_op->splice_write; else splice_write = default_file_splice_write; - file_start_write(out); - ret = splice_write(pipe, out, ppos, len, flags); - file_end_write(out); - return ret; + return splice_write(pipe, out, ppos, len, flags); } /* @@ -1274,7 +1260,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } @@ -1283,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * @in: file to splice from * @ppos: input file offset * @out: file to splice to + * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * @@ -1294,7 +1281,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) + loff_t *opos, size_t len, unsigned int flags) { struct splice_desc sd = { .len = len, @@ -1302,9 +1289,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .opos = opos, }; long ret; + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, opos, len); + if (unlikely(ret < 0)) + return ret; + ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) *ppos = sd.pos; @@ -1325,7 +1323,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; - loff_t offset, *off; + loff_t offset; long ret; ipipe = get_pipe_info(in); @@ -1356,13 +1354,27 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &out->f_pos; + } else { + offset = out->f_pos; + } - ret = do_splice_from(ipipe, out, off, len, flags); + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; - if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + + ret = rw_verify_area(WRITE, out, &offset, len); + if (unlikely(ret < 0)) + return ret; + + file_start_write(out); + ret = do_splice_from(ipipe, out, &offset, len, flags); + file_end_write(out); + + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1376,13 +1388,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &in->f_pos; + } else { + offset = in->f_pos; + } - ret = do_splice_to(in, off, opipe, len, flags); + ret = do_splice_to(in, &offset, opipe, len, flags); - if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 57dc70e..f7f527b 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb, } -static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int squashfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; @@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) * It also means that the external f_pos is offset by 3 from the * on-disk directory f_pos. */ - while (file->f_pos < 3) { + while (ctx->pos < 3) { char *name; int i_ino; - if (file->f_pos == 0) { + if (ctx->pos == 0) { name = "."; size = 1; i_ino = inode->i_ino; @@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) i_ino = squashfs_i(inode)->parent; } - TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", - dirent, name, size, file->f_pos, i_ino, - squashfs_filetype_table[1]); - - if (filldir(dirent, name, size, file->f_pos, i_ino, - squashfs_filetype_table[1]) < 0) { - TRACE("Filldir returned less than 0\n"); + if (!dir_emit(ctx, name, size, i_ino, + squashfs_filetype_table[1])) goto finish; - } - file->f_pos += size; + ctx->pos += size; } length = get_dir_index_using_offset(inode->i_sb, &block, &offset, squashfs_i(inode)->dir_idx_start, squashfs_i(inode)->dir_idx_offset, squashfs_i(inode)->dir_idx_cnt, - file->f_pos); + ctx->pos); while (length < i_size_read(inode)) { /* @@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) length += sizeof(*dire) + size; - if (file->f_pos >= length) + if (ctx->pos >= length) continue; dire->name[size] = '\0'; @@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) ((short) le16_to_cpu(dire->inode_number)); type = le16_to_cpu(dire->type); - TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" - "\n", dirent, dire->name, size, - file->f_pos, - le32_to_cpu(dirh.start_block), - le16_to_cpu(dire->offset), - inode_number, - squashfs_filetype_table[type]); - - if (filldir(dirent, dire->name, size, file->f_pos, + if (!dir_emit(ctx, dire->name, size, inode_number, - squashfs_filetype_table[type]) < 0) { - TRACE("Filldir returned less than 0\n"); + squashfs_filetype_table[type])) goto finish; - } - file->f_pos = length; + ctx->pos = length; } } @@ -238,6 +222,6 @@ failed_read: const struct file_operations squashfs_dir_ops = { .read = generic_read_dir, - .readdir = squashfs_readdir, + .iterate = squashfs_readdir, .llseek = default_llseek, }; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index e8e0e71..e068e74 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left, } /** - * sysfs_link_subling - link sysfs_dirent into sibling rbtree + * sysfs_link_sibling - link sysfs_dirent into sibling rbtree * @sd: sysfs_dirent of interest * * Link @sd into its sibling rbtree which starts from @@ -998,68 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, return pos; } -static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int sysfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = filp->private_data; + struct sysfs_dirent *pos = file->private_data; enum kobj_ns_type type; const void *ns; - ino_t ino; - loff_t off; type = sysfs_ns_type(parent_sd); ns = sysfs_info(dentry->d_sb)->ns[type]; - if (filp->f_pos == 0) { - ino = parent_sd->s_ino; - if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) - filp->f_pos++; - else - return 0; - } - if (filp->f_pos == 1) { - if (parent_sd->s_parent) - ino = parent_sd->s_parent->s_ino; - else - ino = parent_sd->s_ino; - if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) - filp->f_pos++; - else - return 0; - } + if (!dir_emit_dots(file, ctx)) + return 0; mutex_lock(&sysfs_mutex); - off = filp->f_pos; - for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); + for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); pos; - pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { - const char * name; - unsigned int type; - int len, ret; - - name = pos->s_name; - len = strlen(name); - ino = pos->s_ino; - type = dt_type(pos); - off = filp->f_pos = pos->s_hash; - filp->private_data = sysfs_get(pos); + pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { + const char *name = pos->s_name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->s_ino; + ctx->pos = pos->s_hash; + file->private_data = sysfs_get(pos); mutex_unlock(&sysfs_mutex); - ret = filldir(dirent, name, len, off, ino, type); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; mutex_lock(&sysfs_mutex); - if (ret < 0) - break; } mutex_unlock(&sysfs_mutex); - - /* don't reference last entry if its refcount is dropped */ - if (!pos) { - filp->private_data = NULL; - - /* EOF and not changed as 0 or 1 in read/write path */ - if (off == filp->f_pos && off > 1) - filp->f_pos = INT_MAX; - } + file->private_data = NULL; + ctx->pos = INT_MAX; return 0; } @@ -1077,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, - .readdir = sysfs_readdir, + .iterate = sysfs_readdir, .release = sysfs_dir_release, .llseek = sysfs_dir_llseek, }; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 602f56d..d2bb7ed 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); + if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } } spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 0ce3ccf..3e2837a 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -24,8 +24,6 @@ #include <linux/security.h> #include "sysfs.h" -extern struct super_block * sysfs_sb; - static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, .write_begin = simple_write_begin, diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 3799e8d..d42291d 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -18,12 +18,12 @@ #include <linux/swap.h> #include "sysv.h" -static int sysv_readdir(struct file *, void *, filldir_t); +static int sysv_readdir(struct file *, struct dir_context *); const struct file_operations sysv_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = sysv_readdir, + .iterate = sysv_readdir, .fsync = generic_file_fsync, }; @@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n) return page; } -static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int sysv_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = filp->f_pos; - struct inode *inode = file_inode(filp); + unsigned long pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - unsigned offset = pos & ~PAGE_CACHE_MASK; - unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); + unsigned offset; + unsigned long n; - pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); + ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); if (pos >= inode->i_size) - goto done; + return 0; + + offset = pos & ~PAGE_CACHE_MASK; + n = pos >> PAGE_CACHE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) kaddr = (char *)page_address(page); de = (struct sysv_dir_entry *)(kaddr+offset); limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE; - for ( ;(char*)de <= limit; de++) { + for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { char *name = de->name; - int over; if (!de->inode) continue; - offset = (char *)de - kaddr; - - over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), - ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, + if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), - DT_UNKNOWN); - if (over) { + DT_UNKNOWN)) { dir_put_page(page); - goto done; + return 0; } } dir_put_page(page); } - -done: - filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; return 0; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 1c0d5f2..731b2bb 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int sysv_hash(const struct dentry *dentry, const struct inode *inode, - struct qstr *qstr) +static int sysv_hash(const struct dentry *dentry, struct qstr *qstr) { /* Truncate the name in place, avoids having to define a compare function. */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 32b644f..9293121 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -8,6 +8,7 @@ * */ +#include <linux/alarmtimer.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> @@ -26,7 +27,10 @@ #include <linux/rcupdate.h> struct timerfd_ctx { - struct hrtimer tmr; + union { + struct hrtimer tmr; + struct alarm alarm; + } t; ktime_t tintv; ktime_t moffs; wait_queue_head_t wqh; @@ -41,14 +45,19 @@ struct timerfd_ctx { static LIST_HEAD(cancel_list); static DEFINE_SPINLOCK(cancel_lock); +static inline bool isalarm(struct timerfd_ctx *ctx) +{ + return ctx->clockid == CLOCK_REALTIME_ALARM || + ctx->clockid == CLOCK_BOOTTIME_ALARM; +} + /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, * tintv.tv64 != 0) until the timer is accessed. */ -static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +static void timerfd_triggered(struct timerfd_ctx *ctx) { - struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr); unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); @@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) ctx->ticks++; wake_up_locked(&ctx->wqh); spin_unlock_irqrestore(&ctx->wqh.lock, flags); +} +static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) +{ + struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, + t.tmr); + timerfd_triggered(ctx); return HRTIMER_NORESTART; } +static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, + ktime_t now) +{ + struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, + t.alarm); + timerfd_triggered(ctx); + return ALARMTIMER_NORESTART; +} + /* * Called when the clock was set to cancel the timers in the cancel * list. This will wake up processes waiting on these timers. The @@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { - if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && - (flags & TFD_TIMER_CANCEL_ON_SET)) { + if ((ctx->clockid == CLOCK_REALTIME || + ctx->clockid == CLOCK_REALTIME_ALARM) && + (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { if (!ctx->might_cancel) { ctx->might_cancel = true; spin_lock(&cancel_lock); @@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; - remaining = hrtimer_expires_remaining(&ctx->tmr); + if (isalarm(ctx)) + remaining = alarm_expires_remaining(&ctx->t.alarm); + else + remaining = hrtimer_expires_remaining(&ctx->t.tmr); + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } @@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, clockid, htmode); - hrtimer_set_expires(&ctx->tmr, texp); - ctx->tmr.function = timerfd_tmrproc; + + if (isalarm(ctx)) { + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + } else { + hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_set_expires(&ctx->t.tmr, texp); + ctx->t.tmr.function = timerfd_tmrproc; + } + if (texp.tv64 != 0) { - hrtimer_start(&ctx->tmr, texp, htmode); + if (isalarm(ctx)) { + if (flags & TFD_TIMER_ABSTIME) + alarm_start(&ctx->t.alarm, texp); + else + alarm_start_relative(&ctx->t.alarm, texp); + } else { + hrtimer_start(&ctx->t.tmr, texp, htmode); + } + if (timerfd_canceled(ctx)) return -ECANCELED; } @@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file) struct timerfd_ctx *ctx = file->private_data; timerfd_remove_cancel(ctx); - hrtimer_cancel(&ctx->tmr); + + if (isalarm(ctx)) + alarm_cancel(&ctx->t.alarm); + else + hrtimer_cancel(&ctx->t.tmr); kfree_rcu(ctx, rcu); return 0; } @@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, * callback to avoid DoS attacks specifying a very * short timer period. */ - ticks += hrtimer_forward_now(&ctx->tmr, - ctx->tintv) - 1; - hrtimer_restart(&ctx->tmr); + if (isalarm(ctx)) { + ticks += alarm_forward_now( + &ctx->t.alarm, ctx->tintv) - 1; + alarm_restart(&ctx->t.alarm); + } else { + ticks += hrtimer_forward_now(&ctx->t.tmr, + ctx->tintv) - 1; + hrtimer_restart(&ctx->t.tmr); + } } ctx->expired = 0; ctx->ticks = 0; @@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) if ((flags & ~TFD_CREATE_FLAGS) || (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME)) + clockid != CLOCK_REALTIME && + clockid != CLOCK_REALTIME_ALARM && + clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) init_waitqueue_head(&ctx->wqh); ctx->clockid = clockid; - hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + + if (isalarm(ctx)) + alarm_init(&ctx->t.alarm, + ctx->clockid == CLOCK_REALTIME_ALARM ? + ALARM_REALTIME : ALARM_BOOTTIME, + timerfd_alarmproc); + else + hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, @@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags, */ for (;;) { spin_lock_irq(&ctx->wqh.lock); - if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) - break; + + if (isalarm(ctx)) { + if (alarm_try_to_cancel(&ctx->t.alarm) >= 0) + break; + } else { + if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0) + break; + } spin_unlock_irq(&ctx->wqh.lock); cpu_relax(); } @@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags, * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ - if (ctx->expired && ctx->tintv.tv64) - hrtimer_forward_now(&ctx->tmr, ctx->tintv); + if (ctx->expired && ctx->tintv.tv64) { + if (isalarm(ctx)) + alarm_forward_now(&ctx->t.alarm, ctx->tintv); + else + hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); + } old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); old->it_interval = ktime_to_timespec(ctx->tintv); @@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv.tv64) { ctx->expired = 0; - ctx->ticks += - hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; - hrtimer_restart(&ctx->tmr); + + if (isalarm(ctx)) { + ctx->ticks += + alarm_forward_now( + &ctx->t.alarm, ctx->tintv) - 1; + alarm_restart(&ctx->t.alarm); + } else { + ctx->ticks += + hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) + - 1; + hrtimer_restart(&ctx->t.tmr); + } } t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec(ctx->tintv); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index de08c92f..6b4947f 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type) * This means that UBIFS cannot support NFS which requires full * 'seekdir()'/'telldir()' support. */ -static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err, over = 0; + int err; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); - if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) /* * The directory was seek'ed to a senseless position or there * are no more entries. */ return 0; - /* File positions 0 and 1 correspond to "." and ".." */ - if (file->f_pos == 0) { - ubifs_assert(!file->private_data); - over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); - if (over) - return 0; - file->f_pos = 1; + if (file->f_version == 0) { + /* + * The file was seek'ed, which means that @file->private_data + * is now invalid. This may also be just the first + * 'ubifs_readdir()' invocation, in which case + * @file->private_data is NULL, and the below code is + * basically a no-op. + */ + kfree(file->private_data); + file->private_data = NULL; } - if (file->f_pos == 1) { + /* + * 'generic_file_llseek()' unconditionally sets @file->f_version to + * zero, and we use this for detecting whether the file was seek'ed. + */ + file->f_version = 1; + + /* File positions 0 and 1 correspond to "." and ".." */ + if (ctx->pos < 2) { ubifs_assert(!file->private_data); - over = filldir(dirent, "..", 2, 1, - parent_ino(file->f_path.dentry), DT_DIR); - if (over) + if (!dir_emit_dots(file, ctx)) return 0; /* Find the first entry in TNC and save it */ @@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. - * Find the entry corresponding to @file->f_pos or the - * closest one. + * Find the entry corresponding to @ctx->pos or the closest one. */ - dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); nm.name = NULL; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) ubifs_inode(dir)->creat_sqnum); nm.len = le16_to_cpu(dent->nlen); - over = filldir(dirent, dent->name, nm.len, file->f_pos, + if (!dir_emit(ctx, dent->name, nm.len, le64_to_cpu(dent->inum), - vfs_dent_type(dent->type)); - if (over) + vfs_dent_type(dent->type))) return 0; /* Switch to the next entry */ @@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } kfree(file->private_data); - file->f_pos = key_hash_flash(c, &dent->key); + ctx->pos = key_hash_flash(c, &dent->key); file->private_data = dent; cond_resched(); } @@ -448,18 +454,11 @@ out: kfree(file->private_data); file->private_data = NULL; - file->f_pos = 2; + /* 2 is a special value indicating that there are no more direntries */ + ctx->pos = 2; return 0; } -/* If a directory is seeked, we have to free saved readdir() state */ -static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) -{ - kfree(file->private_data); - file->private_data = NULL; - return generic_file_llseek(file, offset, whence); -} - /* Free saved readdir() state when the directory is closed */ static int ubifs_dir_release(struct inode *dir, struct file *file) { @@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = { }; const struct file_operations ubifs_dir_operations = { - .llseek = ubifs_dir_llseek, + .llseek = generic_file_llseek, .release = ubifs_dir_release, .read = generic_read_dir, - .readdir = ubifs_readdir, + .iterate = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 1437453..123c79b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) return err; } -static void ubifs_invalidatepage(struct page *page, unsigned long offset) +static void ubifs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; ubifs_assert(PagePrivate(page)); - if (offset) + if (offset || length < PAGE_CACHE_SIZE) /* Partial page remains dirty */ return; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f21acf0..879b997 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c) ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", c->vi.ubi_num, c->vi.vol_id, c->vi.name, - c->ro_mount ? ", R/O mode" : NULL); + c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", diff --git a/fs/udf/dir.c b/fs/udf/dir.c index b3e93f5..a012c51 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -35,14 +35,16 @@ #include "udf_i.h" #include "udf_sb.h" -static int do_udf_readdir(struct inode *dir, struct file *filp, - filldir_t filldir, void *dirent) + +static int udf_readdir(struct file *file, struct dir_context *ctx) { + struct inode *dir = file_inode(file); + struct udf_inode_info *iinfo = UDF_I(dir); struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; int block, iblock; - loff_t nf_pos = (filp->f_pos - 1) << 2; + loff_t nf_pos; int flen; unsigned char *fname = NULL; unsigned char *nameptr; @@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, uint32_t elen; sector_t offset; int i, num, ret = 0; - unsigned int dt_type; struct extent_position epos = { NULL, 0, {0, 0} }; - struct udf_inode_info *iinfo; + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos = 1; + } + nf_pos = (ctx->pos - 1) << 2; if (nf_pos >= size) goto out; @@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, nf_pos = udf_ext0_offset(dir); fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); - iinfo = UDF_I(dir); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) @@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, } while (nf_pos < size) { - filp->f_pos = (nf_pos >> 2) + 1; + struct kernel_lb_addr tloc; + + ctx->pos = (nf_pos >> 2) + 1; fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); @@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp, } if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { - iblock = parent_ino(filp->f_path.dentry); - flen = 2; - memcpy(fname, "..", flen); - dt_type = DT_DIR; - } else { - struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation); - - iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); - dt_type = DT_UNKNOWN; + if (!dir_emit_dotdot(file, ctx)) + goto out; + continue; } - if (flen && filldir(dirent, fname, flen, filp->f_pos, - iblock, dt_type) < 0) + flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + if (!flen) + continue; + + tloc = lelb_to_cpu(cfi.icb.extLocation); + iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); + if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) goto out; } /* end while */ - filp->f_pos = (nf_pos >> 2) + 1; + ctx->pos = (nf_pos >> 2) + 1; out: if (fibh.sbh != fibh.ebh) @@ -184,27 +189,11 @@ out: return ret; } -static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *dir = file_inode(filp); - int result; - - if (filp->f_pos == 0) { - if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) { - return 0; - } - filp->f_pos++; - } - - result = do_udf_readdir(dir, filp, filldir, dirent); - return result; -} - /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = udf_readdir, + .iterate = udf_readdir, .unlocked_ioctl = udf_ioctl, .fsync = generic_file_fsync, }; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 102c072..5f6fc17 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, return 0; } +static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct udf_inode_info *iinfo; + int err; + + inode = udf_new_inode(dir, mode, &err); + if (!inode) + return err; + + iinfo = UDF_I(inode); + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + d_tmpfile(dentry, inode); + return 0; +} + static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { @@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = { .rmdir = udf_rmdir, .mknod = udf_mknod, .rename = udf_rename, + .tmpfile = udf_tmpfile, }; const struct inode_operations udf_symlink_inode_operations = { .readlink = generic_readlink, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 3a75ca0..0ecc2ce 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base, * This is blatantly stolen from ext2fs */ static int -ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) +ufs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = file_inode(filp); + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = ufs_dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - int need_revalidate = filp->f_version != inode->i_version; + int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; UFSD("BEGIN\n"); @@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + ctx->pos += PAGE_CACHE_SIZE - offset; return -EIO; } kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (struct ufs_dir_entry *)(kaddr+offset); @@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) return -EIO; } if (de->d_ino) { - int over; unsigned char d_type = DT_UNKNOWN; - offset = (char *)de - kaddr; - UFSD("filldir(%s,%u)\n", de->d_name, fs32_to_cpu(sb, de->d_ino)); UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); @@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) d_type = de->d_u.d_44.d_type; - over = filldir(dirent, de->d_name, + if (!dir_emit(ctx, de->d_name, ufs_get_de_namlen(sb, de), - (n<<PAGE_CACHE_SHIFT) | offset, - fs32_to_cpu(sb, de->d_ino), d_type); - if (over) { + fs32_to_cpu(sb, de->d_ino), + d_type)) { ufs_put_page(page); return 0; } } - filp->f_pos += fs16_to_cpu(sb, de->d_reclen); + ctx->pos += fs16_to_cpu(sb, de->d_reclen); } ufs_put_page(page); } @@ -660,7 +656,7 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, - .readdir = ufs_readdir, + .iterate = ufs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 6313b69..4a45080 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \ xfs_dir2_sf.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ + xfs_icreate_item.o \ xfs_inode.o \ xfs_log_recover.o \ xfs_mount.o \ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 1d32f1d..306d883 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -21,6 +21,8 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_vnodeops.h" +#include "xfs_sb.h" +#include "xfs_mount.h" #include "xfs_trace.h" #include <linux/slab.h> #include <linux/xattr.h> @@ -34,7 +36,9 @@ */ STATIC struct posix_acl * -xfs_acl_from_disk(struct xfs_acl *aclp) +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; @@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); - if (count > XFS_ACL_MAX_ENTRIES) + if (count > max_entries) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type) struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl; struct xfs_acl *xfs_acl; - int len = sizeof(struct xfs_acl); unsigned char *ea_name; int error; + int len; acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) @@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type) * If we have a cached ACLs value just return it, not need to * go out to the disk. */ - - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type) goto out; } - acl = xfs_acl_from_disk(xfs_acl); + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; @@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (acl) { struct xfs_acl *xfs_acl; - int len; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return -ENOMEM; xfs_acl_to_disk(xfs_acl, acl); - len = sizeof(struct xfs_acl) - - (sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); @@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) static int xfs_acl_exists(struct inode *inode, unsigned char *name) { - int len = sizeof(struct xfs_acl); + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); return (xfs_attr_get(XFS_I(inode), name, NULL, &len, ATTR_ROOT|ATTR_KERNOVAL) == 0); @@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; error = -EINVAL; - if (acl->a_count > XFS_ACL_MAX_ENTRIES) + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) goto out_release; if (type == ACL_TYPE_ACCESS) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 39632d9..4016a56 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,19 +22,36 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) /* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - } acl_entry[XFS_ACL_MAX_ENTRIES]; + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; }; +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" #define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 5673bcf..71596e5 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -175,6 +175,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ + char userdata, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -189,7 +190,14 @@ xfs_alloc_compute_diff( ASSERT(freelen >= wantlen); freeend = freebno + freelen; wantend = wantbno + wantlen; - if (freebno >= wantbno) { + /* + * We want to allocate from the start of a free extent if it is past + * the desired block or if we are allocating user data and the free + * extent is before desired block. The second case is there to allow + * for contiguous allocation from the remaining free space if the file + * grows in the short term. + */ + if (freebno >= wantbno || (userdata && freeend < wantend)) { if ((newbno1 = roundup(freebno, alignment)) >= freeend) newbno1 = NULLAGBLOCK; } else if (freeend >= wantend && alignment > 1) { @@ -805,7 +813,8 @@ xfs_alloc_find_best_extent( xfs_alloc_fix_len(args); sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, *sbnoa, + args->alignment, + args->userdata, *sbnoa, *slena, &new); /* @@ -976,7 +985,8 @@ restart: if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbnoa, ltlena, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { bdiff = ltdiff; @@ -1128,7 +1138,8 @@ restart: args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbnoa, ltlena, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); error = xfs_alloc_find_best_extent(args, &bno_cur_lt, &bno_cur_gt, @@ -1144,7 +1155,8 @@ restart: args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, gtbnoa, gtlena, >new); + args->alignment, args->userdata, gtbnoa, + gtlena, >new); error = xfs_alloc_find_best_extent(args, &bno_cur_gt, &bno_cur_lt, @@ -1203,7 +1215,7 @@ restart: } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - ltbnoa, ltlena, <new); + args->userdata, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2b2691b..596ec71 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -725,6 +725,25 @@ xfs_convert_page( (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, i_size_read(inode)); + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + len = 1 << inode->i_blkbits; p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), PAGE_CACHE_SIZE); @@ -824,10 +843,12 @@ xfs_cluster_write( STATIC void xfs_vm_invalidatepage( struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); + trace_xfs_invalidatepage(page->mapping->host, page, offset, + length); + block_invalidatepage(page, offset, length); } /* @@ -891,7 +912,7 @@ next_buffer: xfs_iunlock(ip, XFS_ILOCK_EXCL); out_invalidate: - xfs_vm_invalidatepage(page, 0); + xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); return; } @@ -921,7 +942,7 @@ xfs_vm_writepage( int count = 0; int nonblocking = 0; - trace_xfs_writepage(inode, page, 0); + trace_xfs_writepage(inode, page, 0, 0); ASSERT(page_has_buffers(page)); @@ -1152,7 +1173,7 @@ xfs_vm_releasepage( { int delalloc, unwritten; - trace_xfs_releasepage(page->mapping->host, page, 0); + trace_xfs_releasepage(page->mapping->host, page, 0, 0); xfs_count_page_state(page, &delalloc, &unwritten); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 08d5457..31d3cd1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) */ int xfs_attr_shortform_allfit( - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) @@ -954,15 +956,15 @@ xfs_attr_shortform_allfit( return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* @@ -1410,7 +1412,7 @@ xfs_attr3_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -1443,11 +1445,12 @@ xfs_attr3_leaf_add_work( STATIC void xfs_attr3_leaf_compact( struct xfs_da_args *args, - struct xfs_attr3_icleaf_hdr *ichdr_d, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - struct xfs_attr3_icleaf_hdr ichdr_s; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1455,29 +1458,38 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - ichdr_s = *ichdr_d; /* struct copy */ - ichdr_d->firstused = XFS_LBSIZE(mp); - ichdr_d->usedbytes = 0; - ichdr_d->count = 0; - ichdr_d->holes = 0; - ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); - ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, - ichdr_s.count, mp); + xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, + ichdr_src.count, mp); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. @@ -2179,14 +2191,24 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); - memset(tmp_leaf, 0, state->blocksize); - memset(&tmphdr, 0, sizeof(tmphdr)); + tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + memset(&tmphdr, 0, sizeof(tmphdr)); tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; tmphdr.firstused = state->blocksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, @@ -2330,9 +2352,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->valuelen); return XFS_ERROR(EEXIST); } } @@ -2383,7 +2407,8 @@ xfs_attr3_leaf_getvalue( ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; return 0; @@ -2709,7 +2734,8 @@ xfs_attr3_leaf_list_int( args.valuelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); if (retval) return retval; @@ -3232,7 +3258,7 @@ xfs_attr3_leaf_inactive( name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, be32_to_cpu(name_rmt->valuelen)); lp++; } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index f9d7846..444a770 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr { __u8 holes; __u8 pad1; struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ }; #define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index dee8446..ef6b0c1 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -47,22 +47,55 @@ * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ -static int +int xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, - mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; } static bool xfs_attr3_rmt_verify( - struct xfs_buf *bp) + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; @@ -70,7 +103,9 @@ xfs_attr3_rmt_verify( return false; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) return false; - if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) @@ -86,17 +121,40 @@ xfs_attr3_rmt_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + bool corrupt = false; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF) || - !xfs_attr3_rmt_verify(bp)) { + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), + XFS_ATTR3_RMT_CRC_OFF)) { + corrupt = true; + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + corrupt = true; + break; + } + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + + if (corrupt) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); - } + } else + ASSERT(len == 0); } static void @@ -105,23 +163,39 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_attr3_rmt_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); - if (bip) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF); + ASSERT(len == 0); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { @@ -129,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .verify_write = xfs_attr3_rmt_write_verify, }; -static int +STATIC int xfs_attr3_rmt_hdr_set( struct xfs_mount *mp, + void *ptr, xfs_ino_t ino, uint32_t offset, uint32_t size, - struct xfs_buf *bp) + xfs_daddr_t bno) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; @@ -147,36 +222,107 @@ xfs_attr3_rmt_hdr_set( rmt->rm_bytes = cpu_to_be32(size); uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); rmt->rm_owner = cpu_to_be64(ino); - rmt->rm_blkno = cpu_to_be64(bp->b_bn); - bp->b_ops = &xfs_attr3_rmt_buf_ops; + rmt->rm_blkno = cpu_to_be64(bno); return sizeof(struct xfs_attr3_rmt_hdr); } /* - * Checking of the remote attribute header is split into two parts. the verifier - * does CRC, location and bounds checking, the unpacking function checks the - * attribute parameters and owner. + * Helper functions to copy attribute data in and out of the one disk extents */ -static bool -xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **dst) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); - if (offset != be32_to_cpu(rmt->rm_offset)) - return false; - if (size != be32_to_cpu(rmt->rm_bytes)) - return false; - if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + ASSERT(len >= XFS_LBSIZE(mp)); - /* ok */ - return true; + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min_t(int, *valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + src += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == XFS_LBSIZE(mp)); + memset(dst + hdr_size + byte_cnt, 0, + XFS_LBSIZE(mp) - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + dst += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } } /* @@ -190,13 +336,12 @@ xfs_attr_rmtval_get( struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno = args->rmtblkno; - void *dst = args->value; + char *dst = args->value; int valuelen = args->valuelen; int nmap; int error; - int blkcnt; + int blkcnt = args->rmtblkcnt; int i; int offset = 0; @@ -207,52 +352,36 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, + blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap >= 1); for (i = 0; (i < nmap) && (valuelen > 0); i++) { - int byte_cnt; - char *src; + xfs_daddr_t dblkno; + int dblkcnt; ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, + dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; - byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - - src = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino, - offset, byte_cnt, bp)) { - xfs_alert(mp, -"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, args->dp->i_ino); - xfs_buf_relse(bp); - return EFSCORRUPTED; - - } - - src += sizeof(struct xfs_attr3_rmt_hdr); - } - - memcpy(dst, src, byte_cnt); + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); xfs_buf_relse(bp); + if (error) + return error; - offset += byte_cnt; - dst += byte_cnt; - valuelen -= byte_cnt; - + /* roll attribute extent map forwards */ lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; } } ASSERT(valuelen == 0); @@ -270,17 +399,13 @@ xfs_attr_rmtval_set( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - void *src = args->value; + char *src = args->value; int blkcnt; int valuelen; int nmap; int error; - int hdrcnt = 0; - bool crcs = xfs_sb_version_hascrc(&mp->m_sb); int offset = 0; trace_xfs_attr_rmtval_set(args); @@ -289,24 +414,14 @@ xfs_attr_rmtval_set( * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB - * conversion. We calculate the worst case block count in this case - * and we may not need that many, so we have to handle this when - * allocating the blocks below. + * conversion and have to take the header space into account. */ - if (!crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - else - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); - + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; - /* Start with the attribute data. We'll allocate the rest afterwards. */ - if (crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; @@ -349,26 +464,6 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - hdrcnt++; - - /* - * If we have enough blocks for the attribute data, calculate - * how many extra blocks we need for headers. We might run - * through this multiple times in the case that the additional - * headers in the blocks needed for the data fragments spills - * into requiring more blocks. e.g. for 512 byte blocks, we'll - * spill for another block every 9 headers we require in this - * loop. - */ - if (crcs && blkcnt == 0) { - int total_len; - - total_len = args->valuelen + - hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); - blkcnt = XFS_B_TO_FSB(mp, total_len); - blkcnt -= args->rmtblkcnt; - args->rmtblkcnt += blkcnt; - } /* * Start the next trans in the chain. @@ -385,18 +480,19 @@ xfs_attr_rmtval_set( * the INCOMPLETE flag. */ lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; valuelen = args->valuelen; while (valuelen > 0) { - int byte_cnt; - char *buf; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); - /* - * Try to remember where we decided to put the value. - */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); @@ -405,41 +501,27 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - byte_cnt = BBTOB(bp->b_length); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - if (valuelen < byte_cnt) - byte_cnt = valuelen; - - buf = bp->b_addr; - buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, - byte_cnt, bp); - memcpy(buf, src, byte_cnt); - - if (byte_cnt < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt, - BBTOB(bp->b_length) - byte_cnt); + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; - src += byte_cnt; - valuelen -= byte_cnt; - offset += byte_cnt; - hdrcnt--; + /* roll attribute extent map forwards */ lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); - ASSERT(hdrcnt == 0); return 0; } @@ -448,33 +530,40 @@ xfs_attr_rmtval_set( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove(xfs_da_args_t *args) +xfs_attr_rmtval_remove( + struct xfs_da_args *args) { - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; trace_xfs_attr_rmtval_remove(args); - mp = args->dp->i_mount; - /* - * Roll through the "value", invalidating the attribute value's - * blocks. + * Roll through the "value", invalidating the attribute value's blocks. + * Note that args->rmtblkcnt is the minimum number of data blocks we'll + * see for a CRC enabled remote attribute. Each extent will have a + * header, and so we may have more blocks than we realise here. If we + * fail to map the blocks correctly, we'll have problems with the buffer + * lookups. */ lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap == 1); @@ -482,21 +571,20 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } - valuelen -= map.br_blockcount; - lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } /* @@ -506,6 +594,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) blkcnt = args->rmtblkcnt; done = 0; while (!done) { + int committed; + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h index c7cca60..92a8fd7 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/xfs_attr_remote.h @@ -20,6 +20,14 @@ #define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ +/* + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ struct xfs_attr3_rmt_hdr { __be32 rm_magic; __be32 rm_offset; @@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr { extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 70c43d9..1b726d6 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) +#define XFS_BMAP_BMDR_SPACE(bb) \ + (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) /* * Maximum number of bmap btree levels. diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 8804b8a..0903960 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -2544,7 +2544,17 @@ xfs_btree_new_iroot( if (error) goto error0; + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 82b70bd..1b2472a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -513,6 +513,7 @@ _xfs_buf_find( xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", __func__, blkno, eofs); + WARN_ON(1); return NULL; } @@ -1649,7 +1650,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cf26347..bfc4e0c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -140,6 +140,16 @@ xfs_buf_item_size( ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + return XFS_LOG_VEC_ORDERED; + } + /* * the vector count is based on the number of buffer vectors we have * dirty bits in. This will only be greater than one when we have a @@ -212,6 +222,7 @@ xfs_buf_item_format_segment( goto out; } + /* * Fill in an iovec for each set of contiguous chunks. */ @@ -262,12 +273,7 @@ xfs_buf_item_format_segment( vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ + nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; @@ -304,18 +310,36 @@ xfs_buf_item_format( /* * If it is an inode buffer, transfer the in-memory state to the - * format flags and clear the in-memory state. We do not transfer + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer * this state if the inode buffer allocation has not yet been committed * to the log as setting the XFS_BLI_INODE_BUF flag will prevent * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; bip->bli_flags &= ~XFS_BLI_INODE_BUF; } + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + for (i = 0; i < bip->bli_format_count; i++) { vecp = xfs_buf_item_format_segment(bip, vecp, offset, &bip->bli_formats[i]); @@ -345,6 +369,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_pin(bip); @@ -517,8 +542,9 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - int aborted, clean, i; - uint hold; + bool clean; + bool aborted; + int flags; /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -529,23 +555,21 @@ xfs_buf_item_unlock( * (cancelled) buffers at unpin time, but we'll never go through the * pin/unpin cycle if we abort inside commit. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; - + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. */ - hold = bip->bli_flags & XFS_BLI_HOLD; - - /* Clear the per transaction state. */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* * If the buf item is marked stale, then don't do anything. We'll * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (bip->bli_flags & XFS_BLI_STALE) { + if (flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -562,13 +586,19 @@ xfs_buf_item_unlock( * be the only reference to the buf item, so we free it anyway * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. */ - clean = 1; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = 0; - break; + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } } } if (clean) @@ -581,7 +611,7 @@ xfs_buf_item_unlock( } else atomic_dec(&bip->bli_refcount); - if (!hold) + if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); } @@ -847,12 +877,6 @@ xfs_buf_item_log( struct xfs_buf *bp = bip->bli_buf; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * walk each buffer segment and mark them dirty appropriately. */ start = 0; @@ -878,7 +902,7 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has some data that has been logged (at any + * Return 1 if the buffer has been logged or ordered in a transaction (at any * point, not just the current transaction) and 0 if not. */ uint @@ -912,11 +936,11 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); - bip = bp->b_fspriv; bp->b_fspriv = bip->bli_item.li_bio_list; if (bp->b_fspriv == NULL) bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 2573d2a..0f1c247 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 #define XFS_BLI_INODE_BUF 0x40 +#define XFS_BLI_ORDERED 0x80 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ - { XFS_BLI_INODE_BUF, "INODE_BUF" } + { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ + { XFS_BLI_ORDERED, "ORDERED" } #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9b26a99..0b8b2a1 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -270,6 +270,7 @@ xfs_da3_node_read_verify( break; return; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; @@ -2464,7 +2465,8 @@ xfs_buf_map_from_irec( ASSERT(nirecs >= 1); if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); if (!map) return ENOMEM; *mapp = map; @@ -2520,7 +2522,8 @@ xfs_dabuf_map( * Optimize the one-block case. */ if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f852b08..e36445c 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -24,6 +24,9 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" @@ -182,7 +185,7 @@ xfs_swap_extents_check_format( */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) return EINVAL; if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) @@ -192,9 +195,8 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return EINVAL; - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return EINVAL; @@ -219,6 +221,14 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; + /* + * We have no way of updating owner information in the BMBT blocks for + * each inode on CRC enabled filesystems, so to avoid corrupting the + * this metadata we simply don't allow extent swaps to occur. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return XFS_ERROR(EINVAL); + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index b26a50f..8f023de 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -368,10 +368,8 @@ xfs_dir_removename( int xfs_readdir( xfs_inode_t *dp, - void *dirent, - size_t bufsize, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx, + size_t bufsize) { int rval; /* return value */ int v; /* type-checking value */ @@ -385,14 +383,13 @@ xfs_readdir( XFS_STATS_INC(xs_dir_getdents); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); + rval = xfs_dir2_sf_getdents(dp, ctx); else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) ; else if (v) - rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); + rval = xfs_dir2_block_getdents(dp, ctx); else - rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, - filldir); + rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); return rval; } diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e59f5fc..09aea02 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -569,9 +569,7 @@ xfs_dir2_block_addname( int /* error */ xfs_dir2_block_getdents( xfs_inode_t *dp, /* incore inode */ - void *dirent, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx) { xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ @@ -589,7 +587,7 @@ xfs_dir2_block_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) return 0; error = xfs_dir3_block_read(NULL, dp, &bp); @@ -600,7 +598,7 @@ xfs_dir2_block_getdents( * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ - wantoff = xfs_dir2_dataptr_to_off(mp, *offset); + wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* @@ -639,13 +637,12 @@ xfs_dir2_block_getdents( cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, (char *)dep - (char *)hdr); + ctx->pos = cook & 0x7fffffff; /* * If it didn't fit, set the final offset to here & return. */ - if (filldir(dirent, (char *)dep->name, dep->namelen, - cook & 0x7fffffff, be64_to_cpu(dep->inumber), - DT_UNKNOWN)) { - *offset = cook & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), DT_UNKNOWN)) { xfs_trans_brelse(NULL, bp); return 0; } @@ -655,7 +652,7 @@ xfs_dir2_block_getdents( * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ - *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; xfs_trans_brelse(NULL, bp); return 0; diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a3b1bd8..7826782b 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr { struct xfs_dir3_data_hdr { struct xfs_dir3_blk_hdr hdr; xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ }; #define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) @@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr { struct xfs_da3_blkinfo info; /* header for da routines */ __be16 count; /* count of entries */ __be16 stale; /* count of stale entries */ - __be32 pad; + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_icleaf_hdr { @@ -715,6 +716,7 @@ struct xfs_dir3_free_hdr { __be32 firstdb; /* db of first entry */ __be32 nvalid; /* count of valid entries */ __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_free { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 721ba2f..2aed25c 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf( struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp = *bpp; struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; int error = 0; int length; int i; @@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? */ + blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; mip->ra_want > mip->ra_current && i < mip->map_blocks; i += mp->m_dirblkfsbs) { @@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf( } } } + blk_finish_plug(&plug); out: *bpp = bp; @@ -1300,10 +1303,8 @@ out: int /* error */ xfs_dir2_leaf_getdents( xfs_inode_t *dp, /* incore directory inode */ - void *dirent, - size_t bufsize, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx, + size_t bufsize) { struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ @@ -1322,7 +1323,7 @@ xfs_dir2_leaf_getdents( * If the offset is at or past the largest allowed value, * give up right away. */ - if (*offset >= XFS_DIR2_MAX_DATAPTR) + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) return 0; mp = dp->i_mount; @@ -1336,14 +1337,14 @@ xfs_dir2_leaf_getdents( mp->m_sb.sb_blocksize); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP); + KM_SLEEP | KM_NOFS); map_info->map_size = length; /* * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ - curoff = xfs_dir2_dataptr_to_byte(mp, *offset); + curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); /* * Force this conversion through db so we truncate the offset @@ -1444,8 +1445,8 @@ xfs_dir2_leaf_getdents( dep = (xfs_dir2_data_entry_t *)ptr; length = xfs_dir2_data_entsize(dep->namelen); - if (filldir(dirent, (char *)dep->name, dep->namelen, - xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, + ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), DT_UNKNOWN)) break; @@ -1462,9 +1463,9 @@ xfs_dir2_leaf_getdents( * All done. Set output offset value to current offset. */ if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) - *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else - *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; kmem_free(map_info); if (bp) xfs_trans_brelse(NULL, bp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5246de4..2226a00 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -263,18 +263,19 @@ xfs_dir3_free_get_buf( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - hdr.magic = XFS_DIR2_FREE_MAGIC; - hdr.firstdb = 0; - hdr.nused = 0; - hdr.nvalid = 0; + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); - } + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); *bpp = bp; return 0; @@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int( */ freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * xfs_dir3_free_max_bests(mp); - free->hdr.nvalid = 0; - free->hdr.nused = 0; } else { free = fbp->b_addr; bests = xfs_dir3_free_bests_p(mp, free); diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 7cf573c..0511cda 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -33,8 +33,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_block_getdents(struct xfs_inode *dp, + struct dir_context *ctx); extern int xfs_dir2_block_lookup(struct xfs_da_args *args); extern int xfs_dir2_block_removename(struct xfs_da_args *args); extern int xfs_dir2_block_replace(struct xfs_da_args *args); @@ -91,8 +91,8 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_dir2_leaf_entry *ents, int *indexp, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, - size_t bufsize, xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, __uint16_t magic); extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, @@ -153,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, int size, xfs_dir2_sf_hdr_t *sfhp); extern int xfs_dir2_sf_addname(struct xfs_da_args *args); extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, - xfs_off_t *offset, filldir_t filldir); +extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 6157424..97676a3 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -768,9 +768,7 @@ xfs_dir2_sf_create( int /* error */ xfs_dir2_sf_getdents( xfs_inode_t *dp, /* incore directory inode */ - void *dirent, - xfs_off_t *offset, - filldir_t filldir) + struct dir_context *ctx) { int i; /* shortform entry number */ xfs_mount_t *mp; /* filesystem mount point */ @@ -802,7 +800,7 @@ xfs_dir2_sf_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) return 0; /* @@ -819,22 +817,20 @@ xfs_dir2_sf_getdents( /* * Put . entry unless we're starting past it. */ - if (*offset <= dot_offset) { - if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { - *offset = dot_offset & 0x7fffffff; + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) return 0; - } } /* * Put .. entry unless we're starting past it. */ - if (*offset <= dotdot_offset) { + if (ctx->pos <= dotdot_offset) { ino = xfs_dir2_sf_get_parent_ino(sfp); - if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { - *offset = dotdot_offset & 0x7fffffff; + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) return 0; - } } /* @@ -845,21 +841,20 @@ xfs_dir2_sf_getdents( off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, xfs_dir2_sf_get_offset(sfep)); - if (*offset > off) { + if (ctx->pos > off) { sfep = xfs_dir2_sf_nextentry(sfp, sfep); continue; } ino = xfs_dir2_sfe_get_ino(sfp, sfep); - if (filldir(dirent, (char *)sfep->name, sfep->namelen, - off & 0x7fffffff, ino, DT_UNKNOWN)) { - *offset = off & 0x7fffffff; + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, + ino, DT_UNKNOWN)) return 0; - } sfep = xfs_dir2_sf_nextentry(sfp, sfep); } - *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & + ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 0x7fffffff; return 0; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a41f8bf..f01012d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } xfs_trans_dquot_buf(tp, bp, @@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } -STATIC void -xfs_dquot_buf_calc_crc( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - int i; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) { - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc)); - } -} - STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, @@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc))) + XFS_DQUOT_CRC_OFF)) return false; if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) return false; } - return true; } @@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify( } } +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ void xfs_dquot_buf_write_verify( struct xfs_buf *bp) @@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); return; } - xfs_dquot_buf_calc_crc(mp, bp); } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -581,13 +570,13 @@ xfs_qm_dqtobp( xfs_buf_t **O_bpp, uint flags) { - xfs_bmbt_irec_t map; - int nmaps = 1, error; - xfs_buf_t *bp; - xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); - xfs_mount_t *mp = dqp->q_mount; - xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); - xfs_trans_t *tp = (tpp ? *tpp : NULL); + struct xfs_bmbt_irec map; + int nmaps = 1, error; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_mount *mp = dqp->q_mount; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + struct xfs_trans *tp = (tpp ? *tpp : NULL); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; @@ -815,7 +804,7 @@ xfs_qm_dqget( xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; @@ -1151,11 +1140,17 @@ xfs_qm_dqflush( * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); } /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 4f0ebfc..b596626 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -143,10 +143,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) -#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \ - XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ - XFS_DQ_TO_QINF(dqp)->qi_gquotaip) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c0f3750..452920a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip, { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - __xfs_efi_release(efip); - /* recovery needs us to drop the EFI reference, too */ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a5f2042..de3dc98 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -906,11 +906,10 @@ xfs_file_release( STATIC int xfs_file_readdir( - struct file *filp, - void *dirent, - filldir_t filldir) + struct file *file, + struct dir_context *ctx) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); xfs_inode_t *ip = XFS_I(inode); int error; size_t bufsize; @@ -929,8 +928,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - error = xfs_readdir(ip, dirent, bufsize, - (xfs_off_t *)&filp->f_pos, filldir); + error = xfs_readdir(ip, ctx, bufsize); if (error) return -error; return 0; @@ -1270,8 +1268,7 @@ xfs_seek_data( } out: - if (offset != file->f_pos) - file->f_pos = offset; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: xfs_iunlock_map_shared(ip, lock); @@ -1379,8 +1376,7 @@ out: * situation in particular. */ offset = min_t(loff_t, offset, isize); - if (offset != file->f_pos) - file->f_pos = offset; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: xfs_iunlock_map_shared(ip, lock); @@ -1432,7 +1428,7 @@ const struct file_operations xfs_file_operations = { const struct file_operations xfs_dir_file_operations = { .open = xfs_dir_open, .read = generic_read_dir, - .readdir = xfs_file_readdir, + .iterate = xfs_file_readdir, .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f9..d046955 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 87595b2..614eb0c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -99,7 +99,9 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -174,7 +176,7 @@ xfs_growfs_data_private( if (!bp) return EIO; if (bp->b_error) { - int error = bp->b_error; + error = bp->b_error; xfs_buf_relse(bp); return error; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c8f5ae1..7a0c17d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -38,6 +38,7 @@ #include "xfs_bmap.h" #include "xfs_cksum.h" #include "xfs_buf_item.h" +#include "xfs_icreate_item.h" /* @@ -150,12 +151,16 @@ xfs_check_agi_freecount( #endif /* - * Initialise a new set of inodes. + * Initialise a new set of inodes. When called without a transaction context + * (e.g. from recovery) we initiate a delayed write of the inode buffers rather + * than logging them (which in a transaction context puts them into the AIL + * for writeback rather than the xfsbufd queue). */ -STATIC int +int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -208,6 +213,18 @@ xfs_ialloc_inode_init( version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + + /* + * log the initialisation that is about to take place as an + * logical operation. This means the transaction does not + * need to log the physical changes to the inode buffers as log + * recovery will know what initialisation is actually needed. + * Hence we only need to log the buffers as "ordered" buffers so + * they track in the AIL as if they were physically logged. + */ + if (tp) + xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), + mp->m_sb.sb_inodesize, length, gen); } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; else @@ -223,13 +240,8 @@ xfs_ialloc_inode_init( XBF_UNMAPPED); if (!fbuf) return ENOMEM; - /* - * Initialize all inodes in this buffer and then log them. - * - * XXX: It would be much better if we had just one transaction - * to log a whole cluster of inodes instead of all the - * individual transactions causing a lot of log traffic. - */ + + /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < ninodes; i++) { @@ -247,18 +259,39 @@ xfs_ialloc_inode_init( ino++; uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); xfs_dinode_calc_crc(mp, free); - } else { + } else if (tp) { /* just log the inode core */ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); } } - if (version == 3) { - /* need to log the entire buffer */ - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); + + if (tp) { + /* + * Mark the buffer as an inode allocation buffer so it + * sticks in AIL at the point of this allocation + * transaction. This ensures the they are on disk before + * the tail of the log can be moved past this + * transaction (i.e. by preventing relogging from moving + * it forward in the log). + */ + xfs_trans_inode_alloc_buf(tp, fbuf); + if (version == 3) { + /* + * Mark the buffer as ordered so that they are + * not physically logged in the transaction but + * still tracked in the AIL as part of the + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; + xfs_buf_delwri_queue(fbuf, buffer_list); + xfs_buf_relse(fbuf); } - xfs_trans_inode_alloc_buf(tp, fbuf); } return 0; } @@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc( * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. - */ + */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); @@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, args.len, prandom_u32()); if (error) @@ -615,8 +648,7 @@ xfs_ialloc_get_rec( struct xfs_btree_cur *cur, xfs_agino_t agino, xfs_inobt_rec_incore_t *rec, - int *done, - int left) + int *done) { int error; int i; @@ -724,12 +756,12 @@ xfs_dialloc_ag( pag->pagl_leftrec != NULLAGINO && pag->pagl_rightrec != NULLAGINO) { error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, - &trec, &doneleft, 1); + &trec, &doneleft); if (error) goto error1; error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, - &rec, &doneright, 0); + &rec, &doneright); if (error) goto error1; } else { diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index c8da3df..68c0732 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +/* + * Inode chunk initialisation routine + */ +int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_agblock_t length, unsigned int gen); + extern const struct xfs_buf_ops xfs_agi_buf_ops; #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 96e344e..9560dc1f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -335,7 +335,8 @@ xfs_iget_cache_miss( iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) iflags |= XFS_IDONTCACHE; - ip->i_udquot = ip->i_gdquot = NULL; + ip->i_udquot = NULL; + ip->i_gdquot = NULL; xfs_iflags_set(ip, iflags); /* insert the new inode */ diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index e0f138c..a01afbb 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); -int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags, void *args), diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c new file mode 100644 index 0000000..7716a4e --- /dev/null +++ b/fs/xfs/xfs_icreate_item.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2008-2010, 2013 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_error.h" +#include "xfs_icreate_item.h" + +kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_icreate_item, ic_item); +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We only need one iovec for the icreate log structure. + */ +STATIC uint +xfs_icreate_item_size( + struct xfs_log_item *lip) +{ + return 1; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode create log item. + */ +STATIC void +xfs_icreate_item_format( + struct xfs_log_item *lip, + struct xfs_log_iovec *log_vector) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; + log_vector->i_len = sizeof(struct xfs_icreate_log); + log_vector->i_type = XLOG_REG_TYPE_ICREATE; +} + + +/* Pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_pin( + struct xfs_log_item *lip) +{ +} + + +/* pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +STATIC void +xfs_icreate_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + if (icp->ic_item.li_flags & XFS_LI_ABORTED) + kmem_zone_free(xfs_icreate_zone, icp); + return; +} + +/* + * Because we have ordered buffers being tracked in the AIL for the inode + * creation, we don't need the create item after this. Hence we can free + * the log item and return -1 to tell the caller we're done with the item. + */ +STATIC xfs_lsn_t +xfs_icreate_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + kmem_zone_free(xfs_icreate_zone, icp); + return (xfs_lsn_t)-1; +} + +/* item can never get into the AIL */ +STATIC uint +xfs_icreate_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + ASSERT(0); + return XFS_ITEM_SUCCESS; +} + +/* Ordered buffers do the dependency tracking here, so this does nothing. */ +STATIC void +xfs_icreate_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_icreate_item_ops = { + .iop_size = xfs_icreate_item_size, + .iop_format = xfs_icreate_item_format, + .iop_pin = xfs_icreate_item_pin, + .iop_unpin = xfs_icreate_item_unpin, + .iop_push = xfs_icreate_item_push, + .iop_unlock = xfs_icreate_item_unlock, + .iop_committed = xfs_icreate_item_committed, + .iop_committing = xfs_icreate_item_committing, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the icreate item. + */ +void +xfs_icreate_log( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + unsigned int count, + unsigned int inode_size, + xfs_agblock_t length, + unsigned int generation) +{ + struct xfs_icreate_item *icp; + + icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + + xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, + &xfs_icreate_item_ops); + + icp->ic_format.icl_type = XFS_LI_ICREATE; + icp->ic_format.icl_size = 1; /* single vector */ + icp->ic_format.icl_ag = cpu_to_be32(agno); + icp->ic_format.icl_agbno = cpu_to_be32(agbno); + icp->ic_format.icl_count = cpu_to_be32(count); + icp->ic_format.icl_isize = cpu_to_be32(inode_size); + icp->ic_format.icl_length = cpu_to_be32(length); + icp->ic_format.icl_gen = cpu_to_be32(generation); + + xfs_trans_add_item(tp, &icp->ic_item); + tp->t_flags |= XFS_TRANS_DIRTY; + icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h new file mode 100644 index 0000000..88ba8aa --- /dev/null +++ b/fs/xfs/xfs_icreate_item.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2008-2010, Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_ICREATE_ITEM_H +#define XFS_ICREATE_ITEM_H 1 + +/* + * on disk log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. + */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +/* in memory log item structure */ +struct xfs_icreate_item { + struct xfs_log_item ic_item; + struct xfs_icreate_log ic_format; +}; + +extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t agbno, unsigned int count, + unsigned int inode_size, xfs_agblock_t length, + unsigned int generation); + +#endif /* XFS_ICREATE_ITEM_H */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index efbe1ac..9ecfe1e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1028,6 +1028,11 @@ xfs_dinode_calc_crc( /* * Read the disk inode attributes into the in-core inode structure. + * + * If we are initialising a new inode and we are not utilising the + * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core + * with a random generation number. If we are keeping inodes around, we need to + * read the inode cluster to get the existing generation number off disk. */ int xfs_iread( @@ -1047,6 +1052,22 @@ xfs_iread( if (error) return error; + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the on-disk inode core */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else + ip->i_d.di_version = 2; + return 0; + } + /* * Get pointers to the on-disk inode and the buffer containing it. */ @@ -1133,17 +1154,16 @@ xfs_iread( xfs_buf_set_ref(bp, XFS_INO_REF); /* - * Use xfs_trans_brelse() to release the buffer containing the - * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal * brelse(). If we're within a transaction, then xfs_trans_brelse() * will only release the buffer if it is not dirty within the * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be - * locking the new in-core inode before putting it in the hash - * table where other processes can find it. Thus we don't have - * to worry about the inode being changed just because we released - * the buffer. + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. */ out_brelse: xfs_trans_brelse(tp, bp); @@ -1638,6 +1658,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1723,6 +1747,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1796,6 +1824,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1809,6 +1841,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -2012,8 +2048,6 @@ xfs_ifree( int error; int delete; xfs_ino_t first_ino; - xfs_dinode_t *dip; - xfs_buf_t *ibp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2026,14 +2060,13 @@ xfs_ifree( * Pull the on-disk inode from the AGI unlinked list. */ error = xfs_iunlink_remove(tp, ip); - if (error != 0) { + if (error) return error; - } error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); - if (error != 0) { + if (error) return error; - } + ip->i_d.di_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; @@ -2045,31 +2078,10 @@ xfs_ifree( * by reincarnations of this inode. */ ip->i_d.di_gen++; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) - return error; - - /* - * Clear the on-disk di_mode. This is to prevent xfs_bulkstat - * from picking up this inode when it is reclaimed (its incore state - * initialzed but not flushed to disk yet). The in-core di_mode is - * already cleared and a corresponding transaction logged. - * The hack here just synchronizes the in-core to on-disk - * di_mode value in advance before the actual inode sync to disk. - * This is OK because the inode is already unlinked and would never - * change its di_mode again for this inode generation. - * This is a temporary hack that would require a proper fix - * in the future. - */ - dip->di_mode = 0; - - if (delete) { + if (delete) error = xfs_ifree_cluster(ip, tp, first_ino); - } return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 8f8aaee..6a70964 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate( return 0; /* + * If the file is smaller than the minimum prealloc and we are using + * dynamic preallocation, don't do any preallocation at all as it is + * likely this is the only write to the file that is going to be done. + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) + return 0; + + /* * If there are any real blocks past eof, then don't * do any speculative allocation. */ @@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size( if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) return 0; + /* If the file is small, then use the minimum prealloc */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) + return 0; + /* * As we write multiple pages, the offset will always align to the * start of a page and hence point to a hole at EOF. i.e. if the size is diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa..c69bbc4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -455,6 +455,28 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -606,18 +628,8 @@ xfs_setattr_nonsize( /* * Change file access modes. */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. @@ -714,9 +726,8 @@ xfs_setattr_size( return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; @@ -860,6 +871,12 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; @@ -970,7 +987,8 @@ xfs_fiemap_format( if (bmv->bmv_oflags & BMV_OF_PREALLOC) fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= FIEMAP_EXTENT_DELALLOC; + fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); physical = 0; /* no block yet */ } if (bmv->bmv_oflags & BMV_OF_LAST) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 2ea7d40..bc92c53 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -43,7 +43,7 @@ xfs_internal_inum( { return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || (xfs_sb_version_hasquota(&mp->m_sb) && - (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); + xfs_is_quota_inode(&mp->m_sb, ino))); } /* @@ -383,11 +383,13 @@ xfs_bulkstat( * Also start read-ahead now for this chunk. */ if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + struct blk_plug plug; /* * Loop over all clusters in the next chunk. * Do a readahead if there are any allocated * inodes in that cluster. */ + blk_start_plug(&plug); agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; @@ -399,6 +401,7 @@ xfs_bulkstat( agbno, nbcluster, &xfs_inode_buf_ops); } + blk_finish_plug(&plug); irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b345a7c..d852a2b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length( headers++; for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + headers += lv->lv_niovecs; for (i = 0; i < lv->lv_niovecs; i++) { @@ -2216,7 +2220,7 @@ xlog_write( index = 0; lv = log_vector; vecp = lv->lv_iovecp; - while (lv && index < lv->lv_niovecs) { + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2236,13 +2240,22 @@ xlog_write( * This loop writes out as many regions as can fit in the amount * of space which was allocated by xlog_state_get_iclog_space(). */ - while (lv && index < lv->lv_niovecs) { - struct xfs_log_iovec *reg = &vecp[index]; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; int start_rec_copy; int copy_len; int copy_off; + bool ordered = false; + + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + reg = &vecp[index]; ASSERT(reg->i_len % sizeof(__int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); @@ -2302,12 +2315,13 @@ xlog_write( break; if (++index == lv->lv_niovecs) { +next_lv: lv = lv->lv_next; index = 0; if (lv) vecp = lv->lv_iovecp; } - if (record_cnt == 0) { + if (record_cnt == 0 && ordered == false) { if (!lv) return 0; break; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 5caee96..fb630e4 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_UNMOUNT 17 #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_MAX 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 typedef struct xfs_log_iovec { void *i_addr; /* beginning address of region */ @@ -105,6 +106,8 @@ struct xfs_log_vec { int lv_buf_len; /* size of formatted buffer */ }; +#define XFS_LOG_VEC_ORDERED (-1) + /* * Structure used to pass callback function and the function's argument * to the log manager. diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e3d0b85..02b9cf3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs( int index; int len = 0; uint niovecs; + bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) @@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs( if (!niovecs) continue; + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + } + new_lv = kmem_zalloc(sizeof(*new_lv) + niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP); + KM_SLEEP|KM_NOFS); + + new_lv->lv_item = lidp->lid_item; + new_lv->lv_niovecs = niovecs; + if (ordered) { + /* track as an ordered logvec */ + new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto next; + } /* The allocated iovec region lies beyond the log vector. */ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; - new_lv->lv_niovecs = niovecs; - new_lv->lv_item = lidp->lid_item; /* build the vector array and calculate it's length */ IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); @@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs( } ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); +next: if (!ret_lv) ret_lv = new_lv; else @@ -191,8 +209,18 @@ xfs_cil_prepare_item( if (old) { /* existing lv on log item, space used is a delta */ - ASSERT(!list_empty(&lv->lv_item->li_cil)); - ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) || + old->lv_buf_len == XFS_LOG_VEC_ORDERED); + + /* + * If the new item is ordered, keep the old one that is already + * tracking dirty or ordered regions + */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(!lv->lv_buf); + kmem_free(lv); + return; + } *len += lv->lv_buf_len - old->lv_buf_len; *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; @@ -201,10 +229,11 @@ xfs_cil_prepare_item( } else { /* new lv, must pin the log item */ ASSERT(!lv->lv_item->li_lv); - ASSERT(list_empty(&lv->lv_item->li_cil)); - *len += lv->lv_buf_len; - *diff_iovecs += lv->lv_niovecs; + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *len += lv->lv_buf_len; + *diff_iovecs += lv->lv_niovecs; + } IOP_PIN(lv->lv_item); } @@ -259,18 +288,24 @@ xlog_cil_insert_items( * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ - for (lv = log_vector; lv; lv = lv->lv_next) - xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); - - /* account for space used by new iovec headers */ - len += diff_iovecs * sizeof(xlog_op_header_t); - spin_lock(&cil->xc_cil_lock); + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + + ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil)); + lv->lv_next = NULL; - /* move the items to the tail of the CIL */ - for (lv = log_vector; lv; lv = lv->lv_next) + /* + * xfs_cil_prepare_item() may free the lv, so move the item on + * the CIL first. + */ list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); + xfs_cil_prepare_item(log, lv, &len, &diff_iovecs); + lv = next; + } + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); ctx->nvecs += diff_iovecs; /* @@ -381,9 +416,7 @@ xlog_cil_push( struct xfs_cil_ctx *new_ctx; struct xlog_in_core *commit_iclog; struct xlog_ticket *tic; - int num_lv; int num_iovecs; - int len; int error = 0; struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; @@ -428,12 +461,9 @@ xlog_cil_push( * side which is currently locked out by the flush lock. */ lv = NULL; - num_lv = 0; num_iovecs = 0; - len = 0; while (!list_empty(&cil->xc_cil)) { struct xfs_log_item *item; - int i; item = list_first_entry(&cil->xc_cil, struct xfs_log_item, li_cil); @@ -444,11 +474,7 @@ xlog_cil_push( lv->lv_next = item->li_lv; lv = item->li_lv; item->li_lv = NULL; - - num_lv++; num_iovecs += lv->lv_niovecs; - for (i = 0; i < lv->lv_niovecs; i++) - len += lv->lv_iovecp[i].i_len; } /* @@ -701,6 +727,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = log->l_cilp->xc_ctx->sequence; + /* xlog_cil_insert_items() destroys log_vector list */ xlog_cil_insert_items(log, log_vector, tp->t_ticket); /* check we didn't blow the reservation */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 93f03ec..6fcc910a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -45,6 +45,7 @@ #include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_icreate_item.h" /* Need all the magic numbers and buffer ops structures from these headers */ #include "xfs_symlink.h" @@ -1599,10 +1600,53 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. */ STATIC int xlog_recover_reorder_trans( @@ -1612,19 +1656,32 @@ xlog_recover_reorder_trans( { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); break; } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1632,7 +1689,7 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, @@ -1643,6 +1700,14 @@ xlog_recover_reorder_trans( } } ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); return 0; } @@ -1794,7 +1859,13 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - bp->b_ops = &xfs_inode_buf_ops; + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -1861,6 +1932,15 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; @@ -2097,6 +2177,17 @@ xlog_recover_do_reg_buffer( ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. @@ -2134,7 +2225,16 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recovery_validate_buf_type(mp, bp, buf_f); + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* @@ -2255,6 +2355,12 @@ xfs_qm_dqcheck( d->dd_diskdq.d_flags = type; d->dd_diskdq.d_id = cpu_to_be32(id); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + return errs; } @@ -2782,6 +2888,10 @@ xlog_recover_dquot_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); @@ -2886,6 +2996,93 @@ xlog_recover_efd_pass2( } /* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == XFS_IALLOC_INODES(mp)); + ASSERT(length == XFS_IALLOC_BLOCKS(mp)); + if (count != XFS_IALLOC_INODES(mp) || + length != XFS_IALLOC_BLOCKS(mp)) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; +} + +/* * Free up any resources allocated by the transaction * * Remember that EFIs, EFDs, and IUNLINKs are handled later. @@ -2927,6 +3124,7 @@ xlog_recover_commit_pass1( case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_DQUOT: + case XFS_LI_ICREATE: /* nothing to do in pass 1 */ return 0; default: @@ -2957,6 +3155,8 @@ xlog_recover_commit_pass2( return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f6bfbd7..2b0ba35 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -314,7 +314,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - bool check_inprogress) + bool check_inprogress, + bool check_version) { /* @@ -335,11 +336,20 @@ xfs_mount_validate_sb( return XFS_ERROR(EWRONGFS); } + if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) && + (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) { + xfs_notice(mp, +"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n"); + return XFS_ERROR(EFSCORRUPTED); + } + /* * Version 5 superblock feature mask validation. Reject combinations the - * kernel cannot support up front before checking anything else. + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { xfs_alert(mp, "Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" "Use of these features in this kernel is at your own risk!"); @@ -559,6 +569,18 @@ out_unwind: return error; } +static void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); +} + void xfs_sb_from_disk( struct xfs_sb *to, @@ -620,6 +642,35 @@ xfs_sb_from_disk( to->sb_lsn = be64_to_cpu(from->sb_lsn); } +static inline void +xfs_sb_quota_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t *fields) +{ + __uint16_t qflags = from->sb_qflags; + + if (*fields & XFS_SB_QFLAGS) { + /* + * The in-core version of sb_qflags do not have + * XFS_OQUOTA_* flags, whereas the on-disk version + * does. So, convert incore XFS_{PG}QUOTA_* flags + * to on-disk XFS_OQUOTA_* flags. + */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + *fields &= ~XFS_SB_QFLAGS; + } +} + /* * Copy in core superblock to ondisk one. * @@ -641,6 +692,7 @@ xfs_sb_to_disk( if (!fields) return; + xfs_sb_quota_to_disk(to, from, &fields); while (fields) { f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); first = xfs_sb_info[f].offset; @@ -675,7 +727,8 @@ xfs_sb_to_disk( static int xfs_sb_verify( - struct xfs_buf *bp) + struct xfs_buf *bp, + bool check_version) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; @@ -686,7 +739,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); } /* @@ -719,7 +773,7 @@ xfs_sb_read_verify( goto out_error; } } - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, true); out_error: if (error) { @@ -758,7 +812,7 @@ xfs_sb_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; int error; - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, false); if (error) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, error); @@ -831,6 +885,7 @@ reread: */ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); + xfs_sb_quota_from_disk(&mp->m_sb); /* * We must be able to do sector-sized and sector-aligned IO. */ @@ -983,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp) */ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "(sunit/swidth vs. blocksize)"); - return XFS_ERROR(EINVAL); - } - mp->m_dalign = mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } else { /* * Convert the stripe unit and width to FSBs. */ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "(sunit/swidth vs. ag size)"); - return XFS_ERROR(EINVAL); - } xfs_warn(mp, - "stripe alignment turned off: sunit(%d)/swidth(%d) " - "incompatible with agsize(%d)", - mp->m_dalign, mp->m_swidth, - sbp->sb_agblocks); - - mp->m_dalign = 0; - mp->m_swidth = 0; + "alignment check failed: sunit/swidth vs. agsize(%d)", + sbp->sb_agblocks); + return XFS_ERROR(EINVAL); } else if (mp->m_dalign) { mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_warn(mp, "alignment check failed: " - "sunit(%d) less than bsize(%d)", - mp->m_dalign, - mp->m_blockmask +1); - return XFS_ERROR(EINVAL); - } - mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } } @@ -1035,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp) sbp->sb_width = mp->m_swidth; mp->m_update_flags |= XFS_SB_WIDTH; } + } else { + xfs_warn(mp, + "cannot change alignment: superblock does not support data alignment"); + return XFS_ERROR(EINVAL); } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b004cec..4e374d4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -192,8 +192,6 @@ typedef struct xfs_mount { xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ uint m_chsize; /* size of next field */ - struct xfs_chash *m_chash; /* fs private inode per-cluster - * hash table */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ @@ -229,8 +227,6 @@ typedef struct xfs_mount { operations, typically for disk errors in metadata */ #define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ -#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to - user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f41702b..7a3e007 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -41,6 +41,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire @@ -69,7 +70,7 @@ xfs_qm_dquot_walk( void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); uint32_t next_index; int last_error = 0; int skipped; @@ -188,7 +189,7 @@ xfs_qm_dqpurge( xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; @@ -298,8 +299,10 @@ xfs_qm_mount_quotas( */ if (!XFS_IS_UQUOTA_ON(mp)) mp->m_qflags &= ~XFS_UQUOTA_CHKD; - if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) - mp->m_qflags &= ~XFS_OQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; write_changes: /* @@ -488,8 +491,7 @@ xfs_qm_need_dqattach( return false; if (!XFS_NOT_DQATTACHED(mp, ip)) return false; - if (ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; return true; } @@ -605,8 +607,7 @@ xfs_qm_dqdetach( trace_xfs_dquot_dqdetach(ip); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); if (ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; @@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts( xfs_dqid_t id, uint type) { - xfs_disk_dquot_t *ddq; + struct xfs_dqblk *dqb; int j; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = bp->b_addr; + dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } } @@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs( XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error) - break; /* - * XXX(hch): need to figure out if it makes sense to validate - * the CRC here. + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); - /* - * goto the next block. - */ + + /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } @@ -1132,7 +1152,7 @@ xfs_qm_dqusage_adjust( * rootino must have its resources accounted for, not so with the quota * inodes. */ - if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { + if (xfs_is_quota_inode(&mp->m_sb, ino)) { *res = BULKSTAT_RV_NOTHING; return XFS_ERROR(EINVAL); } @@ -1242,19 +1262,20 @@ int xfs_qm_quotacheck( xfs_mount_t *mp) { - int done, count, error, error2; - xfs_ino_t lastino; - size_t structsz; - xfs_inode_t *uip, *gip; - uint flags; - LIST_HEAD (buffer_list); + int done, count, error, error2; + xfs_ino_t lastino; + size_t structsz; + uint flags; + LIST_HEAD (buffer_list); + struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; + struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; count = INT_MAX; structsz = 1; lastino = 0; flags = 0; - ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); + ASSERT(uip || gip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); @@ -1264,7 +1285,6 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - uip = mp->m_quotainfo->qi_uquotaip; if (uip) { error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); @@ -1273,14 +1293,14 @@ xfs_qm_quotacheck( flags |= XFS_UQUOTA_CHKD; } - gip = mp->m_quotainfo->qi_gquotaip; if (gip) { error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; - flags |= XFS_OQUOTA_CHKD; + flags |= XFS_IS_GQUOTA_ON(mp) ? + XFS_GQUOTA_CHKD : XFS_PQUOTA_CHKD; } do { @@ -1375,15 +1395,13 @@ STATIC int xfs_qm_init_quotainos( xfs_mount_t *mp) { - xfs_inode_t *uip, *gip; - int error; - __int64_t sbflags; - uint flags; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + int error; + __int64_t sbflags = 0; + uint flags = 0; ASSERT(mp->m_quotainfo); - uip = gip = NULL; - sbflags = 0; - flags = 0; /* * Get the uquota and gquota inodes @@ -1392,19 +1410,18 @@ xfs_qm_init_quotainos( if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip))) + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip); + if (error) return XFS_ERROR(error); } if (XFS_IS_OQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip))) { - if (uip) - IRELE(uip); - return XFS_ERROR(error); - } + error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip); + if (error) + goto error_rele; } } else { flags |= XFS_QMOPT_SBVERSION; @@ -1419,10 +1436,11 @@ xfs_qm_init_quotainos( * temporarily switch to read-write to do this. */ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { - if ((error = xfs_qm_qino_alloc(mp, &uip, + error = xfs_qm_qino_alloc(mp, &uip, sbflags | XFS_SB_UQUOTINO, - flags | XFS_QMOPT_UQUOTA))) - return XFS_ERROR(error); + flags | XFS_QMOPT_UQUOTA); + if (error) + goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } @@ -1431,18 +1449,21 @@ xfs_qm_init_quotainos( XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); error = xfs_qm_qino_alloc(mp, &gip, sbflags | XFS_SB_GQUOTINO, flags); - if (error) { - if (uip) - IRELE(uip); - - return XFS_ERROR(error); - } + if (error) + goto error_rele; } mp->m_quotainfo->qi_uquotaip = uip; mp->m_quotainfo->qi_gquotaip = gip; return 0; + +error_rele: + if (uip) + IRELE(uip); + if (gip) + IRELE(gip); + return XFS_ERROR(error); } STATIC void @@ -1453,7 +1474,7 @@ xfs_qm_dqfree_one( struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); - radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; @@ -1639,7 +1660,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_gdqpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_dquot *uq, *gq; + struct xfs_dquot *uq = NULL; + struct xfs_dquot *gq = NULL; int error; uint lockflags; @@ -1664,7 +1686,6 @@ xfs_qm_vop_dqalloc( } } - uq = gq = NULL; if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { if (ip->i_d.di_uid != uid) { /* @@ -1677,11 +1698,12 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, XFS_DQ_USER, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &uq))) { + &uq); + if (error) { ASSERT(error != ENOENT); return error; } @@ -1703,15 +1725,14 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, XFS_DQ_GROUP, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); + &gq); + if (error) { ASSERT(error != ENOENT); - return error; + goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; @@ -1723,15 +1744,14 @@ xfs_qm_vop_dqalloc( } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, XFS_DQ_PROJ, XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); + &gq); + if (error) { ASSERT(error != ENOENT); - return (error); + goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; @@ -1754,6 +1774,11 @@ xfs_qm_vop_dqalloc( else if (gq) xfs_qm_dqrele(gq); return 0; + +error_rele: + if (uq) + xfs_qm_dqrele(uq); + return error; } /* @@ -1801,29 +1826,31 @@ xfs_qm_vop_chown( */ int xfs_qm_vop_chown_reserve( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + uint flags) { - xfs_mount_t *mp = ip->i_mount; - uint delblks, blkflags, prjflags = 0; - xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - int error; + struct xfs_mount *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + struct xfs_dquot *udq_unres = NULL; + struct xfs_dquot *gdq_unres = NULL; + struct xfs_dquot *udq_delblks = NULL; + struct xfs_dquot *gdq_delblks = NULL; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); delblks = ip->i_delayed_blks; - delblksudq = delblksgdq = unresudq = unresgdq = NULL; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { - delblksudq = udqp; + udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to * unreserve those from the old dquot, and add them to the @@ -1831,7 +1858,7 @@ xfs_qm_vop_chown_reserve( */ if (delblks) { ASSERT(ip->i_udquot); - unresudq = ip->i_udquot; + udq_unres = ip->i_udquot; } } if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { @@ -1842,18 +1869,19 @@ xfs_qm_vop_chown_reserve( if (prjflags || (XFS_IS_GQUOTA_ON(ip->i_mount) && ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { - delblksgdq = gdqp; + gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); - unresgdq = ip->i_gdquot; + gdq_unres = ip->i_gdquot; } } } - if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags))) - return (error); + error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + udq_delblks, gdq_delblks, ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags); + if (error) + return error; /* * Do the delayed blks reservations/unreservations now. Since, these @@ -1865,14 +1893,15 @@ xfs_qm_vop_chown_reserve( /* * Do the reservations first. Unreservation can't fail. */ - ASSERT(delblksudq || delblksgdq); - ASSERT(unresudq || unresgdq); - if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags))) - return (error); + ASSERT(udq_delblks || gdq_delblks); + ASSERT(udq_unres || gdq_unres); + error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_delblks, gdq_delblks, (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags); + if (error) + return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, + udq_unres, gdq_unres, -((xfs_qcnt_t)delblks), 0, blkflags); } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 5d16a6e..bdb4f8b 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -69,30 +69,62 @@ typedef struct xfs_quotainfo { struct shrinker qi_shrinker; } xfs_quotainfo_t; -#define XFS_DQUOT_TREE(qi, type) \ - ((type & XFS_DQ_USER) ? \ - &((qi)->qi_uquota_tree) : \ - &((qi)->qi_gquota_tree)) +static inline struct radix_tree_root * +xfs_dquot_tree( + struct xfs_quotainfo *qi, + int type) +{ + switch (type) { + case XFS_DQ_USER: + return &qi->qi_uquota_tree; + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return &qi->qi_gquota_tree; + default: + ASSERT(0); + } + return NULL; +} +static inline struct xfs_inode * +xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +{ + switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return dqp->q_mount->m_quotainfo->qi_uquotaip; + case XFS_DQ_GROUP: + case XFS_DQ_PROJ: + return dqp->q_mount->m_quotainfo->qi_gquotaip; + default: + ASSERT(0); + } + return NULL; +} extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, unsigned int nbblks); -extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); -extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, - xfs_dquot_t *, xfs_dquot_t *, long, long, uint); -extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); -extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); +extern void xfs_trans_mod_dquot(struct xfs_trans *, + struct xfs_dquot *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, long, long, uint); +extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); +extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); /* * We keep the usr and grp dquots separately so that locking will be easier * to do at commit time. All transactions that we know of at this point * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. */ +enum { + XFS_QM_TRANS_USR = 0, + XFS_QM_TRANS_GRP, + XFS_QM_TRANS_DQTYPES +}; #define XFS_QM_TRANS_MAXDQS 2 -typedef struct xfs_dquot_acct { - xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; - xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; -} xfs_dquot_acct_t; +struct xfs_dquot_acct { + struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; +}; /* * Users are allowed to have a usage exceeding their softlimit for @@ -106,22 +138,23 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern int xfs_qm_quotacheck(xfs_mount_t *); -extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); +extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); +extern int xfs_qm_quotacheck(struct xfs_mount *); +extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); /* dquot stuff */ -extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); -extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); +extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); +extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ -extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); -extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); +extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct fs_disk_quota *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, - fs_disk_quota_t *); -extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); -extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); -extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); + struct fs_disk_quota *); +extern int xfs_qm_scall_getqstat(struct xfs_mount *, + struct fs_quota_stat *); +extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); +extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c41190c..a08801a 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -117,11 +117,11 @@ xfs_qm_scall_quotaoff( } if (flags & XFS_GQUOTA_ACCT) { dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); inactivate_flags |= XFS_GQUOTA_ACTIVE; } else if (flags & XFS_PQUOTA_ACCT) { dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); inactivate_flags |= XFS_PQUOTA_ACTIVE; } @@ -335,14 +335,14 @@ xfs_qm_scall_quotaon( * quota acct on ondisk without m_qflags' knowing. */ if (((flags & XFS_UQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && - (flags & XFS_UQUOTA_ENFD)) - || + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) || + ((flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD)) || ((flags & XFS_PQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && - (flags & XFS_GQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && - (flags & XFS_OQUOTA_ENFD))) { + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_PQUOTA_ENFD))) { xfs_debug(mp, "%s: Can't enforce without acct, flags=%x sbflags=%x\n", __func__, flags, mp->m_sb.sb_qflags); @@ -407,11 +407,11 @@ xfs_qm_scall_getqstat( struct fs_quota_stat *out) { struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_inode *uip, *gip; - bool tempuqip, tempgqip; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + bool tempuqip = false; + bool tempgqip = false; - uip = gip = NULL; - tempuqip = tempgqip = false; memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; @@ -489,31 +489,36 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -621,9 +626,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } @@ -770,9 +776,12 @@ xfs_qm_scall_getquota( * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ - if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || - (!XFS_IS_OQUOTA_ENFORCED(mp) && - (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_PROJ)) { dst->d_btimer = 0; dst->d_itimer = 0; dst->d_rtbtimer = 0; @@ -780,8 +789,8 @@ xfs_qm_scall_getquota( #ifdef DEBUG if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || - (XFS_IS_OQUOTA_ENFORCED(mp) && - (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && + (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || + (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && dst->d_id != 0) { if ((dst->d_bcount > dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { @@ -827,16 +836,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) uflags |= FS_QUOTA_UDQ_ACCT; - if (flags & XFS_PQUOTA_ACCT) - uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) uflags |= FS_QUOTA_GDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) uflags |= FS_QUOTA_UDQ_ENFD; - if (flags & (XFS_OQUOTA_ENFD)) { - uflags |= (flags & XFS_GQUOTA_ACCT) ? - FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; - } + if (flags & XFS_GQUOTA_ENFD) + uflags |= FS_QUOTA_GDQ_ENFD; + if (flags & XFS_PQUOTA_ENFD) + uflags |= FS_QUOTA_PDQ_ENFD; return (uflags); } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index c61e31c..c3483ba 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,6 +87,8 @@ typedef struct xfs_dqblk { uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + /* * flags for q_flags field in the dquot. */ @@ -159,30 +161,42 @@ typedef struct xfs_qoff_logformat { #define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ /* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +/* * Quota Accounting/Enforcement flags */ #define XFS_ALL_QUOTA_ACCT \ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) -#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) -#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) #define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) #define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) #define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) #define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) -#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) /* * Incore only flags for quotaoff - these bits get cleared when quota(s) * are in the process of getting turned off. These flags are in m_qflags but * never in sb_qflags. */ -#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ #define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) /* * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees @@ -266,24 +280,23 @@ typedef struct xfs_qoff_logformat { ((XFS_IS_UQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ (XFS_IS_GQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \ (XFS_IS_PQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) + (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) #define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD) #define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_PQUOTA_ENFD|XFS_PQUOTA_CHKD) #define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ - XFS_GQUOTA_ACCT) + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) /* diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 71926d6..20e30f9 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -75,8 +75,10 @@ xfs_fs_set_xstate( flags |= XFS_GQUOTA_ACCT; if (uflags & FS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) - flags |= XFS_OQUOTA_ENFD; + if (uflags & FS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + if (uflags & FS_QUOTA_PDQ_ENFD) + flags |= XFS_PQUOTA_ENFD; switch (op) { case Q_XQUOTAON: diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 2de58a8..78f9e70 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature( return (sbp->sb_features_log_incompat & feature) != 0; } +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino); +} + /* * end of superblock version macros */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ea341ce..1d68ffc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -51,6 +51,7 @@ #include "xfs_inode_item.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_icreate_item.h" #include <linux/namei.h> #include <linux/init.h> @@ -359,17 +360,17 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_PQUOTA) || !strcmp(this_char, MNTOPT_PRJQUOTA)) { mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_OQUOTA_ENFD); + XFS_PQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_OQUOTA_ENFD; + mp->m_qflags &= ~XFS_PQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_GQUOTA) || !strcmp(this_char, MNTOPT_GRPQUOTA)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_OQUOTA_ENFD); + XFS_GQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - mp->m_qflags &= ~XFS_OQUOTA_ENFD; + mp->m_qflags &= ~XFS_GQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { xfs_warn(mp, "delaylog is the default now, option is deprecated."); @@ -439,20 +440,15 @@ xfs_parseargs( } done: - if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { + if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { /* * At this point the superblock has not been read * in, therefore we do not know the block size. * Before the mount call ends we will convert * these to FSBs. */ - if (dsunit) { - mp->m_dalign = dsunit; - mp->m_flags |= XFS_MOUNT_RETERR; - } - - if (dswidth) - mp->m_swidth = dswidth; + mp->m_dalign = dsunit; + mp->m_swidth = dswidth; } if (mp->m_logbufs != -1 && @@ -563,12 +559,12 @@ xfs_showargs( /* Either project or group quotas can be active, not both */ if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_OQUOTA_ENFD) + if (mp->m_qflags & XFS_PQUOTA_ENFD) seq_puts(m, "," MNTOPT_PRJQUOTA); else seq_puts(m, "," MNTOPT_PQUOTANOENF); } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_OQUOTA_ENFD) + if (mp->m_qflags & XFS_GQUOTA_ENFD) seq_puts(m, "," MNTOPT_GRPQUOTA); else seq_puts(m, "," MNTOPT_GQUOTANOENF); @@ -1136,8 +1132,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == - (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); return 0; } @@ -1373,6 +1369,17 @@ xfs_finish_flags( } /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + + /* * mkfs'ed attr2 will turn on attr2 mount unless explicitly * told by noattr2 to turn it off */ @@ -1470,6 +1477,10 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); + /* version 5 superblocks support inode version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + sb->s_flags |= MS_I_VERSION; + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1644,9 +1655,15 @@ xfs_init_zones(void) KM_ZONE_SPREAD, NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), + "xfs_icr"); + if (!xfs_icreate_zone) + goto out_destroy_ili_zone; return 0; + out_destroy_ili_zone: + kmem_zone_destroy(xfs_ili_zone); out_destroy_inode_zone: kmem_zone_destroy(xfs_inode_zone); out_destroy_efi_zone: @@ -1685,6 +1702,7 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_icreate_zone); kmem_zone_destroy(xfs_ili_zone); kmem_zone_destroy(xfs_inode_zone); kmem_zone_destroy(xfs_efi_zone); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5f234389..e830fb56 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -56,16 +56,9 @@ xfs_symlink_blocks( struct xfs_mount *mp, int pathlen) { - int fsblocks = 0; - int len = pathlen; + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - do { - fsblocks++; - len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - } while (len > 0); - - ASSERT(fsblocks <= XFS_SYMLINK_MAPS); - return fsblocks; + return (pathlen + buflen - 1) / buflen; } static int @@ -365,7 +358,8 @@ xfs_symlink( int n; xfs_buf_t *bp; prid_t prid; - struct xfs_dquot *udqp, *gdqp; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; uint resblks; *ipp = NULL; @@ -405,7 +399,7 @@ xfs_symlink( if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) fs_blocks = 0; else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); + fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); @@ -512,7 +506,7 @@ xfs_symlink( cur_chunk = target_path; offset = 0; for (n = 0; n < nmaps; n++) { - char *buf; + char *buf; d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); @@ -525,9 +519,7 @@ xfs_symlink( bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } + byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, @@ -542,6 +534,7 @@ xfs_symlink( xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } + ASSERT(pathlen == 0); } /* @@ -593,7 +586,7 @@ xfs_symlink( /* * Free a symlink that has blocks associated with it. */ -int +STATIC int xfs_inactive_symlink_rmt( xfs_inode_t *ip, xfs_trans_t **tpp) @@ -614,7 +607,7 @@ xfs_inactive_symlink_rmt( tp = *tpp; mp = ip->i_mount; - ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); + ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); /* * We're freeing a symlink that has some * blocks allocated to it. Free the @@ -728,3 +721,47 @@ xfs_inactive_symlink_rmt( error0: return error; } + +/* + * xfs_inactive_symlink - free a symlink + */ +int +xfs_inactive_symlink( + struct xfs_inode *ip, + struct xfs_trans **tp) +{ + struct xfs_mount *mp = ip->i_mount; + int pathlen; + + trace_xfs_inactive_symlink(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Zero length symlinks _can_ exist. + */ + pathlen = (int)ip->i_d.di_size; + if (!pathlen) + return 0; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", + __func__, (unsigned long long)ip->i_ino, pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_bytes > 0) + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + return 0; + } + + /* remove the remote symlink */ + return xfs_inactive_symlink_rmt(ip, tp); +} diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index b39398d..3743948 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); +int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp); #endif /* __KERNEL__ */ #endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 2801b5c..1743b9f 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header; #ifdef CONFIG_PROC_FS STATIC int xfs_stats_clear_proc_handler( - ctl_table *ctl, - int write, - void __user *buffer, - size_t *lenp, - loff_t *ppos) + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) { int c, ret, *valp = ctl->data; __uint32_t vn_active; @@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler( STATIC int xfs_panic_mask_proc_handler( - ctl_table *ctl, - int write, - void __user *buffer, - size_t *lenp, - loff_t *ppos) + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) { int ret, *valp = ctl->data; @@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -static ctl_table xfs_table[] = { +static struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, @@ -227,7 +227,7 @@ static ctl_table xfs_table[] = { {} }; -static ctl_table xfs_dir_table[] = { +static struct ctl_table xfs_dir_table[] = { { .procname = "xfs", .mode = 0555, @@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = { {} }; -static ctl_table xfs_root_table[] = { +static struct ctl_table xfs_root_table[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index aa4db33..47910e6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \ TP_PROTO(struct xfs_buf_log_item *bip), \ TP_ARGS(bip)) DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); @@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); DECLARE_EVENT_CLASS(xfs_lock_class, TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, @@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss); DEFINE_INODE_EVENT(xfs_getattr); DEFINE_INODE_EVENT(xfs_setattr); DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_readdir); @@ -974,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read); DEFINE_RW_EVENT(xfs_file_splice_write); DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), - TP_ARGS(inode, page, off), + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(pgoff_t, pgoff) __field(loff_t, size) __field(unsigned long, offset) + __field(unsigned int, length) __field(int, delalloc) __field(int, unwritten) ), @@ -995,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class, __entry->pgoff = page_offset(page); __entry->size = i_size_read(inode); __entry->offset = off; + __entry->length = len; __entry->delalloc = delalloc; __entry->unwritten = unwritten; ), TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "delalloc %d unwritten %d", + "length %x delalloc %d unwritten %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pgoff, __entry->size, __entry->offset, + __entry->length, __entry->delalloc, __entry->unwritten) ) #define DEFINE_PAGE_EVENT(name) \ DEFINE_EVENT(xfs_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ - TP_ARGS(inode, page, off)) + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2fd7c1f..35a2299 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -234,71 +234,93 @@ xfs_calc_remove_reservation( } /* - * For symlink we can modify: + * For create, break it in to the two cases that the transaction + * covers. We start with the modify case - allocation done by modification + * of the state of existing inodes - and the allocation case. + */ + +/* + * For create we can modify: * the parent directory inode: inode size * the new inode: inode size - * the inode btree entry: 1 block + * the inode btree entry: block size + * the superblock for the nlink flag: sector size * the directory btree: (max depth + v2) * dir block size * the directory inode's bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 kB - * Or in the first xact we allocate some inodes giving: + */ +STATIC uint +xfs_calc_create_resv_modify( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); +} + +/* + * For create we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size */ STATIC uint -xfs_calc_symlink_reservation( +xfs_calc_create_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +__xfs_calc_create_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(1, 1024)), - (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + MAX(xfs_calc_create_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); } /* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode's bmap btree: (max depth + v2) * block size - * Or in the first xact we allocate some inodes giving: + * For icreate we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size */ STATIC uint -xfs_calc_create_reservation( +xfs_calc_icreate_resv_alloc( struct xfs_mount *mp) { + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +xfs_calc_icreate_reservation(xfs_mount_t *mp) +{ return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - (uint)XFS_FSB_TO_B(mp, 1) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(mp->m_in_maxlevels, - XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)))); + MAX(xfs_calc_icreate_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return xfs_calc_icreate_reservation(mp); + return __xfs_calc_create_reservation(mp); + } /* @@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation( return xfs_calc_create_reservation(mp); } + +/* + * Making a new symplink is the same as creating a new file, but + * with the added blocks for remote symlink data which can be up to 1kB in + * length (MAXPATHLEN). + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp) + + xfs_calc_buf_res(1, MAXPATHLEN); +} + /* * In freeing an inode we can modify: * the inode being freed: inode size diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a44dba5..2b49463 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -48,6 +48,7 @@ typedef struct xfs_trans_header { #define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -107,7 +108,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 #define XFS_TRANS_CHECKPOINT 42 -#define XFS_TRANS_TYPE_MAX 42 +#define XFS_TRANS_ICREATE 43 +#define XFS_TRANS_TYPE_MAX 43 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -210,23 +212,18 @@ struct xfs_log_item_desc { /* * Per-extent log reservation for the allocation btree changes * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) * block size + * 2 trees * (2 blocks/level * max depth - 1) */ -#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) /* * Per-directory log reservation for any directory change. - * dir blocks: (1 btree block per level + data block + free block) * dblock size - * bmap btree: (levels + 2) * max depth * block size + * dir blocks: (1 btree block per level + data block + free block) + * bmap btree: (levels + 2) * max depth * v2 directory blocks can be fragmented below the dirblksize down to the fsb * size, so account for that in the DAENTER macros. */ -#define XFS_DIROP_LOG_RES(mp) \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) #define XFS_DIROP_LOG_COUNT(mp) \ (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) @@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 73a5fa4..aa5a04b 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -397,7 +397,6 @@ shutdown_abort: return XFS_ERROR(EIO); } - /* * Release the buffer bp which was previously acquired with one of the * xfs_trans_... buffer allocation routines if the buffer has not @@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; - bip->bli_flags |= XFS_BLI_LOGGED; - xfs_buf_item_log(bip, first, last); + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. + */ + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); } @@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf( } /* + * Mark the buffer as ordered for this transaction. This means + * that the contents of the buffer are not recorded in the transaction + * but it is tracked in the AIL as though it was. This allows us + * to record logical changes in transactions rather than the physical + * changes we make to the buffer without changing writeback ordering + * constraints of metadata buffers. + */ +void +xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); +} + +/* * Set the type of the buffer for log recovery so that it can correctly identify * and hence attach the correct buffer ops to the buffer after replay. */ diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index fec75d0..3ba64d5 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo( return; xfs_trans_alloc_dqinfo(ntp); - oqa = otp->t_dqinfo->dqa_usrdquots; - nqa = ntp->t_dqinfo->dqa_usrdquots; /* * Because the quota blk reservation is carried forward, @@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo( if(otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; - for (j = 0; j < 2; j++) { + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + oqa = otp->t_dqinfo->dqs[j]; + nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (oqa[i].qt_dquot == NULL) break; @@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo( oq->qt_ino_res = oq->qt_ino_res_used; } - oqa = otp->t_dqinfo->dqa_grpdquots; - nqa = ntp->t_dqinfo->dqa_grpdquots; } } @@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino( if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; if (tp->t_dqinfo == NULL) @@ -170,16 +167,18 @@ xfs_trans_mod_dquot_byino( (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); } -STATIC xfs_dqtrx_t * +STATIC struct xfs_dqtrx * xfs_trans_get_dqtrx( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { - int i; - xfs_dqtrx_t *qa; + int i; + struct xfs_dqtrx *qa; - qa = XFS_QM_ISUDQ(dqp) ? - tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + if (XFS_QM_ISUDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; + else + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || @@ -339,12 +338,10 @@ xfs_trans_apply_dquot_deltas( return; ASSERT(tp->t_dqinfo); - qa = tp->t_dqinfo->dqa_usrdquots; - for (j = 0; j < 2; j++) { - if (qa[0].qt_dquot == NULL) { - qa = tp->t_dqinfo->dqa_grpdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + if (qa[0].qt_dquot == NULL) continue; - } /* * Lock all of the dquots and join them to the transaction. @@ -495,10 +492,6 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); } - /* - * Do the group quotas next - */ - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -521,9 +514,9 @@ xfs_trans_unreserve_and_mod_dquots( if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; - qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; - for (j = 0; j < 2; j++) { for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { qtrx = &qa[i]; /* @@ -565,7 +558,6 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqunlock(dqp); } - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -640,8 +632,8 @@ xfs_trans_dqresv( if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || - (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && - (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { + (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || + (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { if (nblks > 0) { /* * dquot is locked already. See if we'd go over the @@ -748,15 +740,15 @@ error_return: */ int xfs_trans_reserve_quota_bydquots( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - long nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + long nblks, + long ninos, + uint flags) { - int resvd = 0, error; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; @@ -771,28 +763,24 @@ xfs_trans_reserve_quota_bydquots( (flags & ~XFS_QMOPT_ENOSPC)); if (error) return error; - resvd = 1; } if (gdqp) { error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); - if (error) { - /* - * can't do it, so backout previous reservation - */ - if (resvd) { - flags |= XFS_QMOPT_FORCE_RES; - xfs_trans_dqresv(tp, mp, udqp, - -nblks, -ninos, flags); - } - return error; - } + if (error) + goto unwind_usr; } /* * Didn't change anything critical, so, no need to log */ return 0; + +unwind_usr: + flags |= XFS_QMOPT_FORCE_RES; + if (udqp) + xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); + return error; } @@ -816,8 +804,7 @@ xfs_trans_reserve_quota_nblks( if (XFS_IS_PQUOTA_ON(mp)) flags |= XFS_QMOPT_ENOSPC; - ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); - ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index ac6d567..53dfe46 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -112,6 +112,17 @@ xfs_trans_log_inode( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* + * First time we log the inode in a transaction, bump the inode change + * counter if it is configured for this to occur. + */ + if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + IS_I_VERSION(VFS_I(ip))) { + inode_inc_iversion(VFS_I(ip)); + ip->i_d.di_changecount = VFS_I(ip)->i_version; + flags |= XFS_ILOG_CORE; + } + tp->t_flags |= XFS_TRANS_DIRTY; ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 1501f4f..42c0ef2 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -322,18 +322,9 @@ xfs_inactive( xfs_trans_ijoin(tp, ip, 0); if (S_ISLNK(ip->i_d.di_mode)) { - /* - * Zero length symlinks _can_ exist. - */ - if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) { - error = xfs_inactive_symlink_rmt(ip, &tp); - if (error) - goto out_cancel; - } else if (ip->i_df.if_bytes > 0) { - xfs_idata_realloc(ip, -(ip->i_df.if_bytes), - XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - } + error = xfs_inactive_symlink(ip, &tp); + if (error) + goto out_cancel; } else if (truncate) { ip->i_d.di_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1453,7 +1444,7 @@ xfs_free_file_space( xfs_mount_t *mp; int nimap; uint resblks; - uint rounding; + xfs_off_t rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; @@ -1482,7 +1473,7 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); } - rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 5163022..38c67c3 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, - xfs_off_t *offset, filldir_t filldir); +int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); |