Diffstat (limited to 'fs'): 288 files changed, 10214 insertions, 7885 deletions
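The bulk of the per-filesystem churn below is one mechanical rename: end_writeback() becomes clear_inode(), with call sites otherwise unchanged (9p, affs, afs, autofs4, bfs, binfmt_misc, block_dev, btrfs, cifs, coda, and others). A minimal sketch of the resulting ->evict_inode shape; "myfs" is a hypothetical filesystem, not one from this series:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/buffer_head.h>

/* Typical ->evict_inode after the rename: tear down the page cache
 * and buffers, then mark the inode clear. The last call was spelled
 * end_writeback() before this series. */
static void myfs_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);  /* drop cached pages */
	invalidate_inode_buffers(inode);          /* drop buffer heads */
	clear_inode(inode);                       /* was end_writeback() */
	/* fs-specific teardown (fscache cookies, preallocations) follows */
}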
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 014c8dd..57ccb75 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -448,7 +448,7 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode *v9inode = V9FS_I(inode); truncate_inode_pages(inode->i_mapping, 0); - end_writeback(inode); + clear_inode(inode); filemap_fdatawrite(inode->i_mapping); #ifdef CONFIG_9P_FSCACHE diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index e95d1b6..0225742 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -33,7 +33,7 @@ config ARCH_BINFMT_ELF_RANDOMIZE_PIE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y - depends on (FRV || BLACKFIN || (SUPERH32 && !MMU)) + depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) help ELF FDPIC binaries are based on ELF, but allow the individual load segments of a binary to be located in memory independently of each diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 88a4b0b..8bc4a59 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -264,7 +264,7 @@ affs_evict_inode(struct inode *inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); affs_free_prealloc(inode); cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index d890ae3..95cffd3 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -423,7 +423,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); afs_give_up_callback(vnode); @@ -1456,6 +1456,10 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) if (ret < 0) goto out; + ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); + if (ret < 0) + goto out; + kiocb->ki_nr_segs = kiocb->ki_nbytes; kiocb->ki_cur_seg = 0; /* ki_nbytes/left now reflect bytes instead of segs */ @@ -1467,11 +1471,17 @@ out: return ret; } -static ssize_t aio_setup_single_vector(struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) { + int bytes; + + bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); + if (bytes < 0) + return bytes; + kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_left; + kiocb->ki_iovec->iov_len = bytes; kiocb->ki_nr_segs = 1; kiocb->ki_cur_seg = 0; return 0; @@ -1496,10 +1506,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, kiocb->ki_left))) break; - ret = security_file_permission(file, MAY_READ); - if (unlikely(ret)) - break; - ret = aio_setup_single_vector(kiocb); + ret = aio_setup_single_vector(READ, file, kiocb); if (ret) break; ret = -EINVAL; @@ -1514,10 +1521,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, kiocb->ki_left))) break; - ret = security_file_permission(file, MAY_WRITE); - if (unlikely(ret)) - break; - ret = aio_setup_single_vector(kiocb); + ret = aio_setup_single_vector(WRITE, file, kiocb); if (ret) break; ret = -EINVAL; @@ -1528,9 +1532,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) break; - ret = security_file_permission(file, MAY_READ); - if (unlikely(ret)) - break; ret = aio_setup_vectored_rw(READ, kiocb, compat); if (ret) break; @@ -1542,9 +1543,6 @@ static ssize_t 
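The aio_setup_*() hunks above are from fs/aio.c: they drop the per-opcode security_file_permission() calls in favor of rw_verify_area(), which runs the same LSM hook and additionally returns the requested byte count clamped against the file position and size limits. A hedged sketch of that contract; checked_len() is a hypothetical wrapper, not from the patch:

#include <linux/fs.h>

/* rw_verify_area() returns a negative errno on denial, otherwise the
 * number of bytes the caller may transfer, which can be smaller than
 * 'want'. aio stores the clamped value in its single-vector iovec. */
static ssize_t checked_len(int type, struct file *file,
			   loff_t *pos, size_t want)
{
	int bytes = rw_verify_area(type, file, pos, want);

	if (bytes < 0)
		return bytes;	/* e.g. -EPERM from the LSM hook */
	return bytes;		/* safe length, possibly < want */
}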
aio_setup_iocb(struct kiocb *kiocb, bool compat) ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) break; - ret = security_file_permission(file, MAY_WRITE); - if (unlikely(ret)) - break; ret = aio_setup_vectored_rw(WRITE, kiocb, compat); if (ret) break; @@ -47,14 +47,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && - (current_fsuid() != inode->i_uid || - attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) + (!uid_eq(current_fsuid(), inode->i_uid) || + !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && - (current_fsuid() != inode->i_uid || - (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && + (!uid_eq(current_fsuid(), inode->i_uid) || + (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && !capable(CAP_CHOWN)) return -EPERM; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 6e488eb..8a4fed8 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -100,7 +100,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) static void autofs4_evict_inode(struct inode *inode) { - end_writeback(inode); + clear_inode(inode); kfree(inode->i_private); } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index e23dc7c..9870417 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -174,7 +174,7 @@ static void bfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (inode->i_nlink) return; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 16f7354..e658dd1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -226,10 +226,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec->e_entry); - NEW_AUX_ENT(AT_UID, cred->uid); - NEW_AUX_ENT(AT_EUID, cred->euid); - NEW_AUX_ENT(AT_GID, cred->gid); - NEW_AUX_ENT(AT_EGID, cred->egid); + NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -1356,8 +1356,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); - SET_UID(psinfo->pr_uid, cred->uid); - SET_GID(psinfo->pr_gid, cred->gid); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d390a0f..3d77cf8 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -627,10 +627,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); NEW_AUX_ENT(AT_FLAGS, 0); NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); - NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid); - NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid); - NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid); - NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid); + NEW_AUX_ENT(AT_UID, 
(elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid)); + NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); + NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); + NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); NEW_AUX_ENT(AT_EXECFN, bprm->exec); @@ -1421,8 +1421,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); - SET_UID(psinfo->pr_uid, cred->uid); - SET_GID(psinfo->pr_gid, cred->gid); + SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); + SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 613aa06..790b3cd 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -505,7 +505,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { - end_writeback(inode); + clear_inode(inode); kfree(inode->i_private); } @@ -505,9 +505,14 @@ EXPORT_SYMBOL(bio_clone); int bio_get_nr_vecs(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - return min_t(unsigned, + int nr_pages; + + nr_pages = min_t(unsigned, queue_max_segments(q), queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); + + return min_t(unsigned, nr_pages, BIO_MAX_PAGES); + } EXPORT_SYMBOL(bio_get_nr_vecs); diff --git a/fs/block_dev.c b/fs/block_dev.c index e08f6a20..c2bbe1f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -70,7 +70,7 @@ static void bdev_inode_switch_bdi(struct inode *inode, spin_unlock(&dst->wb.list_lock); } -static sector_t max_block(struct block_device *bdev) +sector_t blkdev_max_block(struct block_device *bdev) { sector_t retval = ~((sector_t)0); loff_t sz = i_size_read(bdev->bd_inode); @@ -163,7 +163,7 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - if (iblock >= max_block(I_BDEV(inode))) { + if (iblock >= blkdev_max_block(I_BDEV(inode))) { if (create) return -EIO; @@ -185,7 +185,7 @@ static int blkdev_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { - sector_t end_block = max_block(I_BDEV(inode)); + sector_t end_block = blkdev_max_block(I_BDEV(inode)); unsigned long max_blocks = bh->b_size >> inode->i_blkbits; if ((iblock + max_blocks) > end_block) { @@ -487,7 +487,7 @@ static void bdev_evict_inode(struct inode *inode) struct list_head *p; truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); /* is it needed here? 
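Two block-layer tweaks sit above: fs/bio.c caps bio_get_nr_vecs() at BIO_MAX_PAGES, since bio_alloc() can never supply more biovecs than that regardless of the queue limits, and fs/block_dev.c exports max_block() as blkdev_max_block() for use by fs/buffer.c below. A usage sketch for the capped helper (alloc_dev_bio() is illustrative):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* After the cap, the result of bio_get_nr_vecs() can be handed to
 * bio_alloc() directly without exceeding the largest biovec pool. */
static struct bio *alloc_dev_bio(struct block_device *bdev, gfp_t gfp)
{
	int nr_vecs = bio_get_nr_vecs(bdev);

	return bio_alloc(gfp, nr_vecs);	/* NULL on allocation failure */
}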
*/ - end_writeback(inode); + clear_inode(inode); spin_lock(&bdev_lock); while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { __bd_forget(list_entry(p, struct inode, i_devices)); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a7ffc88..e1fe74a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2753,7 +2753,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) * one reference for us, and we leave it for the * caller */ - device->flush_bio = NULL;; + device->flush_bio = NULL; bio = bio_alloc(GFP_NOFS, 0); if (!bio) return -ENOMEM; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 61b16c6..ceb7b9c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3756,7 +3756,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); no_delete: - end_writeback(inode); + clear_inode(inode); return; } diff --git a/fs/buffer.c b/fs/buffer.c index 351e18e..ad5938c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -921,6 +921,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); + sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode)); do { if (!buffer_mapped(bh)) { @@ -929,7 +930,8 @@ init_page_buffers(struct page *page, struct block_device *bdev, bh->b_blocknr = block; if (uptodate) set_buffer_uptodate(bh); - set_buffer_mapped(bh); + if (block < end_block) + set_buffer_mapped(bh); } block++; bh = bh->b_this_page; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3c9c794..8b6e344 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -271,7 +271,7 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); cifs_fscache_release_inode_cookie(inode); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2870597..f181312 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -244,7 +244,7 @@ static void coda_put_super(struct super_block *sb) static void coda_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); coda_cache_clear_inode(inode); } diff --git a/fs/compat.c b/fs/compat.c index f2944ac..0781e61 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -144,8 +144,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; diff --git a/fs/dcache.c b/fs/dcache.c index b80531c..4435d8b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -153,16 +153,12 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. 
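The userns conversions above (fs/attr.c's inode_change_ok(), the binfmt auxv entries, fs/compat.c's cp_compat_stat(), and the devpts options further below) share one discipline: kuid_t/kgid_t values live inside the kernel and are compared with uid_eq()/gid_eq(); raw integers appear only at a userspace boundary, converted with make_kuid() on the way in and from_kuid_munged() on the way out. A sketch of the round trip (report_uid() and its policy are hypothetical):

#include <linux/cred.h>
#include <linux/uidgid.h>

static int report_uid(uid_t user_value, uid_t *out)
{
	kuid_t kuid = make_kuid(current_user_ns(), user_value);

	if (!uid_valid(kuid))		/* no mapping in this namespace */
		return -EINVAL;
	if (!uid_eq(kuid, current_fsuid()))
		return -EPERM;		/* typed compare, never raw == */
	/* munged: an id unmapped in the target ns becomes the overflow id */
	*out = from_kuid_munged(current_user_ns(), kuid);
	return 0;
}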
*/ -static inline int dentry_cmp(const unsigned char *cs, size_t scount, - const unsigned char *ct, size_t tcount) +static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; - if (unlikely(scount != tcount)) - return 1; - for (;;) { - a = load_unaligned_zeropad(cs); + a = *(unsigned long *)cs; b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; @@ -180,12 +176,8 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount, #else -static inline int dentry_cmp(const unsigned char *cs, size_t scount, - const unsigned char *ct, size_t tcount) +static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { - if (scount != tcount) - return 1; - do { if (*cs != *ct) return 1; @@ -198,6 +190,30 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount, #endif +static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) +{ + const unsigned char *cs; + /* + * Be careful about RCU walk racing with rename: + * use ACCESS_ONCE to fetch the name pointer. + * + * NOTE! Even if a rename will mean that the length + * was not loaded atomically, we don't care. The + * RCU walk will check the sequence count eventually, + * and catch it. And we won't overrun the buffer, + * because we're reading the name pointer atomically, + * and a dentry name is guaranteed to be properly + * terminated with a NUL byte. + * + * End result: even if 'len' is wrong, we'll exit + * early because the data cannot match (there can + * be no NUL in the ct/tcount data) + */ + cs = ACCESS_ONCE(dentry->d_name.name); + smp_read_barrier_depends(); + return dentry_string_cmp(cs, ct, tcount); +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); @@ -1258,6 +1274,13 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) if (!dentry) return NULL; + /* + * We guarantee that the inline name is always NUL-terminated. + * This way the memcpy() done by the name switching in rename + * will still always have a NUL at the end, even if we might + * be overwriting an internal NUL character + */ + dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { dname = kmalloc(name->len + 1, GFP_KERNEL); if (!dname) { @@ -1267,13 +1290,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } else { dname = dentry->d_iname; } - dentry->d_name.name = dname; dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; + /* Make sure we always see the terminating NUL character */ + smp_wmb(); + dentry->d_name.name = dname; + dentry->d_count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); @@ -1439,18 +1465,18 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, } list_for_each_entry(alias, &inode->i_dentry, d_alias) { - struct qstr *qstr = &alias->d_name; - /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. 
*/ - if (qstr->hash != hash) + if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; - if (dentry_cmp(qstr->name, qstr->len, name, len)) + if (alias->d_name.len != len) + continue; + if (dentry_cmp(alias, name, len)) continue; __dget(alias); return alias; @@ -1489,7 +1515,7 @@ struct dentry *d_make_root(struct inode *root_inode) struct dentry *res = NULL; if (root_inode) { - static const struct qstr name = { .name = "/", .len = 1 }; + static const struct qstr name = QSTR_INIT("/", 1); res = __d_alloc(root_inode->i_sb, &name); if (res) @@ -1727,6 +1753,48 @@ err_out: } EXPORT_SYMBOL(d_add_ci); +/* + * Do the slow-case of the dentry name compare. + * + * Unlike the dentry_cmp() function, we need to atomically + * load the name, length and inode information, so that the + * filesystem can rely on them, and can use the 'name' and + * 'len' information without worrying about walking off the + * end of memory etc. + * + * Thus the read_seqcount_retry() and the "duplicate" info + * in arguments (the low-level filesystem should not look + * at the dentry inode or name contents directly, since + * rename can change them while we're in RCU mode). + */ +enum slow_d_compare { + D_COMP_OK, + D_COMP_NOMATCH, + D_COMP_SEQRETRY, +}; + +static noinline enum slow_d_compare slow_dentry_cmp( + const struct dentry *parent, + struct inode *inode, + struct dentry *dentry, + unsigned int seq, + const struct qstr *name) +{ + int tlen = dentry->d_name.len; + const char *tname = dentry->d_name.name; + struct inode *i = dentry->d_inode; + + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); + return D_COMP_SEQRETRY; + } + if (parent->d_op->d_compare(parent, inode, + dentry, i, + tlen, tname, name)) + return D_COMP_NOMATCH; + return D_COMP_OK; +} + /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry @@ -1753,15 +1821,17 @@ EXPORT_SYMBOL(d_add_ci); * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. + * + * NOTE! The caller *has* to check the resulting dentry against the sequence + * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, - unsigned *seqp, struct inode **inode) + unsigned *seqp, struct inode *inode) { - unsigned int len = name->len; - unsigned int hash = name->hash; + u64 hashlen = name->hash_len; const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen)); struct hlist_bl_node *node; struct dentry *dentry; @@ -1787,49 +1857,47 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; - struct inode *i; - const char *tname; - int tlen; - - if (dentry->d_name.hash != hash) - continue; seqretry: - seq = read_seqcount_begin(&dentry->d_seq); + /* + * The dentry sequence count protects us from concurrent + * renames, and thus protects inode, parent and name fields. + * + * The caller must perform a seqcount check in order + * to do anything useful with the returned dentry, + * including using the 'd_inode' pointer. + * + * NOTE! We do a "raw" seqcount_begin here. That means that + * we don't wait for the sequence count to stabilize if it + * is in the middle of a sequence change. 
If we do the slow + * dentry compare, we will do seqretries until it is stable, + * and if we end up with a successful lookup, we actually + * want to exit RCU lookup anyway. + */ + seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; - tlen = dentry->d_name.len; - tname = dentry->d_name.name; - i = dentry->d_inode; - prefetch(tname); - /* - * This seqcount check is required to ensure name and - * len are loaded atomically, so as not to walk off the - * edge of memory when walking. If we could load this - * atomically some other way, we could drop this check. - */ - if (read_seqcount_retry(&dentry->d_seq, seq)) - goto seqretry; + *seqp = seq; + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { - if (parent->d_op->d_compare(parent, *inode, - dentry, i, - tlen, tname, name)) + if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; - } else { - if (dentry_cmp(tname, tlen, str, len)) + switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { + case D_COMP_OK: + return dentry; + case D_COMP_NOMATCH: continue; + default: + goto seqretry; + } } - /* - * No extra seqcount check is required after the name - * compare. The caller must perform a seqcount check in - * order to do anything useful with the returned dentry - * anyway. - */ - *seqp = seq; - *inode = i; - return dentry; + + if (dentry->d_name.hash_len != hashlen) + continue; + if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) + return dentry; } return NULL; } @@ -1908,8 +1976,6 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { - const char *tname; - int tlen; if (dentry->d_name.hash != hash) continue; @@ -1924,15 +1990,17 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name) * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). 
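The __d_lookup_rcu() rewrite above keys its fast path on qstr->hash_len: the hash and the name length are packed into a single 64-bit word, so one comparison rejects any candidate whose hash or length differs, before the byte-wise dentry_cmp(). A conceptual sketch of the packing (field order in the real union is endian-dependent; this ignores that):

#include <linux/types.h>

struct qstr_sketch {
	union {
		struct { u32 hash; u32 len; };
		u64 hash_len;		/* both fields in one word */
	};
};

/* One 64-bit compare covers both "wrong hash" and "wrong length". */
static inline bool quick_mismatch(const struct qstr_sketch *a,
				  const struct qstr_sketch *b)
{
	return a->hash_len != b->hash_len;
}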
*/ - tlen = dentry->d_name.len; - tname = dentry->d_name.name; if (parent->d_flags & DCACHE_OP_COMPARE) { + int tlen = dentry->d_name.len; + const char *tname = dentry->d_name.name; if (parent->d_op->d_compare(parent, parent->d_inode, dentry, dentry->d_inode, tlen, tname, name)) goto next; } else { - if (dentry_cmp(tname, tlen, str, len)) + if (dentry->d_name.len != len) + goto next; + if (dentry_cmp(dentry, str, len)) goto next; } @@ -3025,6 +3093,7 @@ static void __init dcache_init_early(void) HASH_EARLY, &d_hash_shift, &d_hash_mask, + 0, 0); for (loop = 0; loop < (1U << d_hash_shift); loop++) @@ -3055,6 +3124,7 @@ static void __init dcache_init(void) 0, &d_hash_shift, &d_hash_mask, + 0, 0); for (loop = 0; loop < (1U << d_hash_shift); loop++) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 5dfafdd..2340f69 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -20,6 +20,7 @@ #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/io.h> +#include <linux/slab.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -520,6 +521,133 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_blob); +struct array_data { + void *array; + u32 elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, u32 array_size) +{ + size_t ret = 0; + u32 i; + + for (i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, + u32 array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, + data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, + file->private_data, size); +} + +static int u32_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static const struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release = u32_array_release, + .read = u32_array_read, + .llseek = no_llseek, +}; + +/** + * debugfs_create_u32_array - create a debugfs file that is used to read u32 + * array. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @array: u32 array that provides data. 
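A hedged usage sketch for the helper documented here; "latency" and latency_buckets are illustrative, and the array must outlive the debugfs file, since only the pointer is stored in the file's private data:

#include <linux/debugfs.h>
#include <linux/kernel.h>

static u32 latency_buckets[16];		/* updated elsewhere */

static void myfs_debugfs_init(struct dentry *parent)
{
	/* read-only file; reads render the array as space-separated %u */
	debugfs_create_u32_array("latency", 0444, parent,
				 latency_buckets,
				 ARRAY_SIZE(latency_buckets));
}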
+ * @elements: total number of elements in the array. + * + * This function creates a file in debugfs with the given name that exports + * @array as data. If the @mode variable is so set it can be read from. + * Writing is not supported. Seek within the file is also not supported. + * Once array is created its size can not be changed. + * + * The function returns a pointer to dentry on success. If debugfs is not + * enabled in the kernel, the value -%ENODEV will be returned. + */ +struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32_array); + #ifdef CONFIG_HAS_IOMEM /* diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 10f5e0b..979c1e3 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -98,8 +98,8 @@ static struct vfsmount *devpts_mnt; struct pts_mount_opts { int setuid; int setgid; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; umode_t mode; umode_t ptmxmode; int newinstance; @@ -158,11 +158,13 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) { char *p; + kuid_t uid; + kgid_t gid; opts->setuid = 0; opts->setgid = 0; - opts->uid = 0; - opts->gid = 0; + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; opts->mode = DEVPTS_DEFAULT_MODE; opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; @@ -184,13 +186,19 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; - opts->uid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; opts->setuid = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; - opts->gid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; opts->setgid = 1; break; case Opt_mode: @@ -315,9 +323,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) - seq_printf(seq, ",uid=%u", opts->uid); + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); if (opts->setgid) - seq_printf(seq, ",gid=%u", opts->gid); + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 90e5997..63dc19c 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -310,6 +310,7 @@ void dlm_callback_resume(struct dlm_ls *ls) } mutex_unlock(&ls->ls_cb_mutex); - log_debug(ls, "dlm_callback_resume %d", count); + if (count) + log_debug(ls, "dlm_callback_resume %d", count); } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 3a564d1..bc342f7 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -38,6 +38,7 @@ #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/idr.h> +#include <linux/ratelimit.h> #include <asm/uaccess.h> #include <linux/dlm.h> @@ -74,6 +75,13 @@ do { \ (ls)->ls_name , ##args); \ } while (0) +#define log_limit(ls, fmt, args...) 
\ +do { \ + if (dlm_config.ci_log_debug) \ + printk_ratelimited(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) + #define DLM_ASSERT(x, do) \ { \ if (!(x)) \ @@ -263,6 +271,8 @@ struct dlm_lkb { ktime_t lkb_last_cast_time; /* for debugging */ ktime_t lkb_last_bast_time; /* for debugging */ + uint64_t lkb_recover_seq; /* from ls_recover_seq */ + char *lkb_lvbptr; struct dlm_lksb *lkb_lksb; /* caller's status block */ void (*lkb_astfn) (void *astparam); @@ -317,7 +327,7 @@ enum rsb_flags { RSB_NEW_MASTER, RSB_NEW_MASTER2, RSB_RECOVER_CONVERT, - RSB_LOCKS_PURGED, + RSB_RECOVER_GRANT, }; static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) @@ -563,6 +573,7 @@ struct dlm_ls { struct mutex ls_requestqueue_mutex; struct dlm_rcom *ls_recover_buf; int ls_recover_nodeid; /* for debugging */ + unsigned int ls_recover_locks_in; /* for log info */ uint64_t ls_rcom_seq; spinlock_t ls_rcom_spin; struct list_head ls_recover_list; @@ -589,6 +600,7 @@ struct dlm_ls { #define LSFL_UEVENT_WAIT 5 #define LSFL_TIMEWARN 6 #define LSFL_CB_DELAY 7 +#define LSFL_NODIR 8 /* much of this is just saving user space pointers associated with the lock that we pass back to the user lib with an ast */ @@ -636,7 +648,7 @@ static inline int dlm_recovery_stopped(struct dlm_ls *ls) static inline int dlm_no_directory(struct dlm_ls *ls) { - return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; + return test_bit(LSFL_NODIR, &ls->ls_flags); } int dlm_netlink_init(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 4c58d4a..bdafb65 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -160,11 +160,12 @@ static const int __quecvt_compat_matrix[8][8] = { void dlm_print_lkb(struct dlm_lkb *lkb) { - printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" - " status %d rqmode %d grmode %d wait_type %d\n", + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " + "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, - lkb->lkb_grmode, lkb->lkb_wait_type); + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, + (unsigned long long)lkb->lkb_recover_seq); } static void dlm_print_rsb(struct dlm_rsb *r) @@ -251,8 +252,6 @@ static inline int is_process_copy(struct dlm_lkb *lkb) static inline int is_master_copy(struct dlm_lkb *lkb) { - if (lkb->lkb_flags & DLM_IFL_MSTCPY) - DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb);); return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0; } @@ -479,6 +478,9 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, kref_get(&r->res_ref); goto out; } + if (error == -ENOTBLK) + goto out; + error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); if (error) goto out; @@ -586,6 +588,23 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, return error; } +static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) +{ + struct rb_node *n; + struct dlm_rsb *r; + int i; + + for (i = 0; i < ls->ls_rsbtbl_size; i++) { + spin_lock(&ls->ls_rsbtbl[i].lock); + for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) { + r = rb_entry(n, struct dlm_rsb, res_hashnode); + if (r->res_hash == hash) + dlm_dump_rsb(r); + } + spin_unlock(&ls->ls_rsbtbl[i].lock); + } +} + /* This is only called to add a reference when the code already holds a valid reference to the rsb, so there's no need for locking. 
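The new log_limit() macro defined above takes the same arguments as log_debug() but goes through printk_ratelimited(), so a recovery path that can fire once per lock cannot flood the log. A usage sketch (note_stale() and its arguments are illustrative and assume the dlm-internal types):

/* Same call shape as log_debug(); output is rate-limited. */
static void note_stale(struct dlm_ls *ls, int type, int nodeid)
{
	log_limit(ls, "ignoring stale message type %d from node %d",
		  type, nodeid);
}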
*/ @@ -1064,8 +1083,9 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, goto out_del; } - log_error(ls, "remwait error %x reply %d flags %x no wait_type", - lkb->lkb_id, mstype, lkb->lkb_flags); + log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", + lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid, + mstype, lkb->lkb_flags); return -1; out_del: @@ -1498,13 +1518,13 @@ static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } lkb->lkb_rqmode = DLM_LOCK_IV; + lkb->lkb_highbast = 0; } static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) { set_lvb_lock(r, lkb); _grant_lock(r, lkb); - lkb->lkb_highbast = 0; } static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, @@ -1866,7 +1886,8 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, /* Returns the highest requested mode of all blocked conversions; sets cw if there's a blocked conversion to DLM_LOCK_CW. */ -static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) +static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) { struct dlm_lkb *lkb, *s; int hi, demoted, quit, grant_restart, demote_restart; @@ -1885,6 +1906,8 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) if (can_be_granted(r, lkb, 0, &deadlk)) { grant_lock_pending(r, lkb); grant_restart = 1; + if (count) + (*count)++; continue; } @@ -1918,14 +1941,17 @@ static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw) return max_t(int, high, hi); } -static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw) +static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw, + unsigned int *count) { struct dlm_lkb *lkb, *s; list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { - if (can_be_granted(r, lkb, 0, NULL)) + if (can_be_granted(r, lkb, 0, NULL)) { grant_lock_pending(r, lkb); - else { + if (count) + (*count)++; + } else { high = max_t(int, lkb->lkb_rqmode, high); if (lkb->lkb_rqmode == DLM_LOCK_CW) *cw = 1; @@ -1954,16 +1980,20 @@ static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw) return 0; } -static void grant_pending_locks(struct dlm_rsb *r) +static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count) { struct dlm_lkb *lkb, *s; int high = DLM_LOCK_IV; int cw = 0; - DLM_ASSERT(is_master(r), dlm_dump_rsb(r);); + if (!is_master(r)) { + log_print("grant_pending_locks r nodeid %d", r->res_nodeid); + dlm_dump_rsb(r); + return; + } - high = grant_pending_convert(r, high, &cw); - high = grant_pending_wait(r, high, &cw); + high = grant_pending_convert(r, high, &cw, count); + high = grant_pending_wait(r, high, &cw, count); if (high == DLM_LOCK_IV) return; @@ -2499,7 +2529,7 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) before we try again to grant this one. 
*/ if (is_demoted(lkb)) { - grant_pending_convert(r, DLM_LOCK_IV, NULL); + grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL); if (_can_be_granted(r, lkb, 1)) { grant_lock(r, lkb); queue_cast(r, lkb, 0); @@ -2527,7 +2557,7 @@ static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, { switch (error) { case 0: - grant_pending_locks(r); + grant_pending_locks(r, NULL); /* grant_pending_locks also sends basts */ break; case -EAGAIN: @@ -2550,7 +2580,7 @@ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, int error) { - grant_pending_locks(r); + grant_pending_locks(r, NULL); } /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ @@ -2571,7 +2601,7 @@ static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, int error) { if (error) - grant_pending_locks(r); + grant_pending_locks(r, NULL); } /* @@ -3372,7 +3402,7 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) return error; } -static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3412,14 +3442,15 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) error = 0; if (error) dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3429,6 +3460,15 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto fail; + if (lkb->lkb_remid != ms->m_lkid) { + log_error(ls, "receive_convert %x remid %x recover_seq %llu " + "remote %d %x", lkb->lkb_id, lkb->lkb_remid, + (unsigned long long)lkb->lkb_recover_seq, + ms->m_header.h_nodeid, ms->m_lkid); + error = -ENOENT; + goto fail; + } + r = lkb->lkb_resource; hold_rsb(r); @@ -3456,14 +3496,15 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3473,6 +3514,14 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto fail; + if (lkb->lkb_remid != ms->m_lkid) { + log_error(ls, "receive_unlock %x remid %x remote %d %x", + lkb->lkb_id, lkb->lkb_remid, + ms->m_header.h_nodeid, ms->m_lkid); + error = -ENOENT; + goto fail; + } + r = lkb->lkb_resource; hold_rsb(r); @@ -3497,14 +3546,15 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3532,25 +3582,23 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); - return; + return 0; fail: setup_stub_lkb(ls, ms); 
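receive_convert() and receive_unlock() above now verify that the message's lock id matches the lkb on record (lkb_remid vs m_lkid) and, like the other handlers in this patch, return an error instead of logging locally; _receive_message() further below classifies the result once. A sketch of that classification (dispatch() stands in for the message-type switch):

/* -ENOENT is expected for async messages racing with lock teardown
 * (grant, bast, cancel) and only worth a debug line; elsewhere it
 * indicates a real inconsistency. */
static void handle_message(struct dlm_ls *ls, struct dlm_message *ms)
{
	int noent = (ms->m_type == DLM_MSG_GRANT ||
		     ms->m_type == DLM_MSG_BAST ||
		     ms->m_type == DLM_MSG_CANCEL);
	int error = dispatch(ls, ms);	/* hypothetical: calls receive_*() */

	if (error == -ENOENT && noent)
		log_debug(ls, "expected miss, type %d", ms->m_type);
	else if (error)
		log_error(ls, "receive error %d, type %d", error, ms->m_type);
}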
send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + return error; } -static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_grant from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; r = lkb->lkb_resource; @@ -3570,20 +3618,18 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); + return 0; } -static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_bast from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; r = lkb->lkb_resource; @@ -3595,10 +3641,12 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) goto out; queue_bast(r, lkb, ms->m_bastmode); + lkb->lkb_highbast = ms->m_bastmode; out: unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); + return 0; } static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) @@ -3653,18 +3701,15 @@ static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) do_purge(ls, ms->m_nodeid, ms->m_pid); } -static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; int error, mstype, result; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_request_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; r = lkb->lkb_resource; hold_rsb(r); @@ -3676,8 +3721,13 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) mstype = lkb->lkb_wait_type; error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); - if (error) + if (error) { + log_error(ls, "receive_request_reply %x remote %d %x result %d", + lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_result); + dlm_dump_rsb(r); goto out; + } /* Optimization: the dir node was also the master, so it took our lookup as a request and sent request reply instead of lookup reply */ @@ -3755,6 +3805,7 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); + return 0; } static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, @@ -3793,8 +3844,11 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, break; default: - log_error(r->res_ls, "receive_convert_reply %x error %d", - lkb->lkb_id, ms->m_result); + log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d", + lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid, + ms->m_result); + dlm_print_rsb(r); + dlm_print_lkb(lkb); } } @@ -3821,20 +3875,18 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) put_rsb(r); } -static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_convert_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if 
(error) + return error; _receive_convert_reply(lkb, ms); dlm_put_lkb(lkb); + return 0; } static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) @@ -3873,20 +3925,18 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) put_rsb(r); } -static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_unlock_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; _receive_unlock_reply(lkb, ms); dlm_put_lkb(lkb); + return 0; } static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) @@ -3925,20 +3975,18 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) put_rsb(r); } -static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; int error; error = find_lkb(ls, ms->m_remid, &lkb); - if (error) { - log_debug(ls, "receive_cancel_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); - return; - } + if (error) + return error; _receive_cancel_reply(lkb, ms); dlm_put_lkb(lkb); + return 0; } static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) @@ -3949,7 +3997,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_lkid, &lkb); if (error) { - log_error(ls, "receive_lookup_reply no lkb"); + log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid); return; } @@ -3993,8 +4041,11 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) dlm_put_lkb(lkb); } -static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) +static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq) { + int error = 0, noent = 0; + if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { log_debug(ls, "ignore non-member message %d from %d %x %x %d", ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, @@ -4007,47 +4058,50 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) /* messages sent to a master node */ case DLM_MSG_REQUEST: - receive_request(ls, ms); + error = receive_request(ls, ms); break; case DLM_MSG_CONVERT: - receive_convert(ls, ms); + error = receive_convert(ls, ms); break; case DLM_MSG_UNLOCK: - receive_unlock(ls, ms); + error = receive_unlock(ls, ms); break; case DLM_MSG_CANCEL: - receive_cancel(ls, ms); + noent = 1; + error = receive_cancel(ls, ms); break; /* messages sent from a master node (replies to above) */ case DLM_MSG_REQUEST_REPLY: - receive_request_reply(ls, ms); + error = receive_request_reply(ls, ms); break; case DLM_MSG_CONVERT_REPLY: - receive_convert_reply(ls, ms); + error = receive_convert_reply(ls, ms); break; case DLM_MSG_UNLOCK_REPLY: - receive_unlock_reply(ls, ms); + error = receive_unlock_reply(ls, ms); break; case DLM_MSG_CANCEL_REPLY: - receive_cancel_reply(ls, ms); + error = receive_cancel_reply(ls, ms); break; /* messages sent from a master node (only two types of async msg) */ case DLM_MSG_GRANT: - receive_grant(ls, ms); + noent = 1; + error = receive_grant(ls, ms); break; case DLM_MSG_BAST: - receive_bast(ls, ms); + noent = 1; + error = receive_bast(ls, ms); break; /* messages sent to a dir node */ @@ -4075,6 +4129,37 @@ static void _receive_message(struct 
dlm_ls *ls, struct dlm_message *ms) default: log_error(ls, "unknown message type %d", ms->m_type); } + + /* + * When checking for ENOENT, we're checking the result of + * find_lkb(m_remid): + * + * The lock id referenced in the message wasn't found. This may + * happen in normal usage for the async messages and cancel, so + * only use log_debug for them. + * + * Some errors are expected and normal. + */ + + if (error == -ENOENT && noent) { + log_debug(ls, "receive %d no %x remote %d %x saved_seq %u", + ms->m_type, ms->m_remid, ms->m_header.h_nodeid, + ms->m_lkid, saved_seq); + } else if (error == -ENOENT) { + log_error(ls, "receive %d no %x remote %d %x saved_seq %u", + ms->m_type, ms->m_remid, ms->m_header.h_nodeid, + ms->m_lkid, saved_seq); + + if (ms->m_type == DLM_MSG_CONVERT) + dlm_dump_rsb_hash(ls, ms->m_hash); + } + + if (error == -EINVAL) { + log_error(ls, "receive %d inval from %d lkid %x remid %x " + "saved_seq %u", + ms->m_type, ms->m_header.h_nodeid, + ms->m_lkid, ms->m_remid, saved_seq); + } } /* If the lockspace is in recovery mode (locking stopped), then normal @@ -4092,16 +4177,17 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, dlm_add_requestqueue(ls, nodeid, ms); } else { dlm_wait_requestqueue(ls); - _receive_message(ls, ms); + _receive_message(ls, ms, 0); } } /* This is called by dlm_recoverd to process messages that were saved on the requestqueue. */ -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms) +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq) { - _receive_message(ls, ms); + _receive_message(ls, ms, saved_seq); } /* This is called by the midcomms layer when something is received for @@ -4137,9 +4223,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - if (dlm_config.ci_log_debug) - log_print("invalid lockspace %x from %d cmd %d type %d", - hd->h_lockspace, nodeid, hd->h_cmd, type); + if (dlm_config.ci_log_debug) { + printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " + "%u from %d cmd %d type %d\n", + hd->h_lockspace, nodeid, hd->h_cmd, type); + } if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) dlm_send_ls_not_ready(nodeid, &p->rcom); @@ -4187,15 +4275,13 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, /* A waiting lkb needs recovery if the master node has failed, or the master node is changing (only when no directory is used) */ -static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) +static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, + int dir_nodeid) { - if (dlm_is_removed(ls, lkb->lkb_nodeid)) + if (dlm_no_directory(ls)) return 1; - if (!dlm_no_directory(ls)) - return 0; - - if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid) + if (dlm_is_removed(ls, lkb->lkb_wait_nodeid)) return 1; return 0; @@ -4212,6 +4298,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) struct dlm_lkb *lkb, *safe; struct dlm_message *ms_stub; int wait_type, stub_unlock_result, stub_cancel_result; + int dir_nodeid; ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); if (!ms_stub) { @@ -4223,13 +4310,21 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { + dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource); + /* exclude debug messages about unlocks because there can be so many and they aren't very interesting */ if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { - 
log_debug(ls, "recover_waiter %x nodeid %d " - "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, - lkb->lkb_wait_type, lkb->lkb_wait_nodeid); + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d", + lkb->lkb_id, + lkb->lkb_remid, + lkb->lkb_wait_type, + lkb->lkb_resource->res_nodeid, + lkb->lkb_nodeid, + lkb->lkb_wait_nodeid, + dir_nodeid); } /* all outstanding lookups, regardless of destination will be @@ -4240,7 +4335,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) continue; } - if (!waiter_needs_recovery(ls, lkb)) + if (!waiter_needs_recovery(ls, lkb, dir_nodeid)) continue; wait_type = lkb->lkb_wait_type; @@ -4373,8 +4468,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) ou = is_overlap_unlock(lkb); err = 0; - log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", - lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); + log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " + "lkb_nodeid %d wait_nodeid %d dir_nodeid %d " + "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype, + r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid, + dlm_dir_nodeid(r), oc, ou); /* At this point we assume that we won't get a reply to any previous op or overlap op on this lock. First, do a big @@ -4426,9 +4524,12 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) } } - if (err) - log_error(ls, "recover_waiters_post %x %d %x %d %d", - lkb->lkb_id, mstype, lkb->lkb_flags, oc, ou); + if (err) { + log_error(ls, "waiter %x msg %d r_nodeid %d " + "dir_nodeid %d overlap %d %d", + lkb->lkb_id, mstype, r->res_nodeid, + dlm_dir_nodeid(r), oc, ou); + } unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -4437,112 +4538,177 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) return error; } -static void purge_queue(struct dlm_rsb *r, struct list_head *queue, - int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb)) +static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list) { - struct dlm_ls *ls = r->res_ls; struct dlm_lkb *lkb, *safe; - list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { - if (test(ls, lkb)) { - rsb_set_flag(r, RSB_LOCKS_PURGED); - del_lkb(r, lkb); - /* this put should free the lkb */ - if (!dlm_put_lkb(lkb)) - log_error(ls, "purged lkb not released"); - } + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + /* don't purge lkbs we've added in recover_master_copy for + the current recovery seq */ + + if (lkb->lkb_recover_seq == ls->ls_recover_seq) + continue; + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged mstcpy lkb not released"); } } -static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb) +void dlm_purge_mstcpy_locks(struct dlm_rsb *r) { - return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid)); -} + struct dlm_ls *ls = r->res_ls; -static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb) -{ - return is_master_copy(lkb); + purge_mstcpy_list(ls, r, &r->res_grantqueue); + purge_mstcpy_list(ls, r, &r->res_convertqueue); + purge_mstcpy_list(ls, r, &r->res_waitqueue); } -static void purge_dead_locks(struct dlm_rsb *r) +static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, + struct list_head *list, + int nodeid_gone, unsigned int *count) { - purge_queue(r, &r->res_grantqueue, &purge_dead_test); - purge_queue(r, &r->res_convertqueue, &purge_dead_test); - purge_queue(r, &r->res_waitqueue, &purge_dead_test); -} + struct dlm_lkb *lkb, *safe; -void 
dlm_purge_mstcpy_locks(struct dlm_rsb *r) -{ - purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test); - purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test); - purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test); + list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) { + if (!is_master_copy(lkb)) + continue; + + if ((lkb->lkb_nodeid == nodeid_gone) || + dlm_is_removed(ls, lkb->lkb_nodeid)) { + + del_lkb(r, lkb); + + /* this put should free the lkb */ + if (!dlm_put_lkb(lkb)) + log_error(ls, "purged dead lkb not released"); + + rsb_set_flag(r, RSB_RECOVER_GRANT); + + (*count)++; + } + } } /* Get rid of locks held by nodes that are gone. */ -int dlm_purge_locks(struct dlm_ls *ls) +void dlm_recover_purge(struct dlm_ls *ls) { struct dlm_rsb *r; + struct dlm_member *memb; + int nodes_count = 0; + int nodeid_gone = 0; + unsigned int lkb_count = 0; - log_debug(ls, "dlm_purge_locks"); + /* cache one removed nodeid to optimize the common + case of a single node removed */ + + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { + nodes_count++; + nodeid_gone = memb->nodeid; + } + + if (!nodes_count) + return; down_write(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { hold_rsb(r); lock_rsb(r); - if (is_master(r)) - purge_dead_locks(r); + if (is_master(r)) { + purge_dead_list(ls, r, &r->res_grantqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_convertqueue, + nodeid_gone, &lkb_count); + purge_dead_list(ls, r, &r->res_waitqueue, + nodeid_gone, &lkb_count); + } unlock_rsb(r); unhold_rsb(r); - - schedule(); + cond_resched(); } up_write(&ls->ls_root_sem); - return 0; + if (lkb_count) + log_debug(ls, "dlm_recover_purge %u locks for %u nodes", + lkb_count, nodes_count); } -static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) +static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket) { struct rb_node *n; - struct dlm_rsb *r, *r_ret = NULL; + struct dlm_rsb *r; spin_lock(&ls->ls_rsbtbl[bucket].lock); for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) { r = rb_entry(n, struct dlm_rsb, res_hashnode); - if (!rsb_flag(r, RSB_LOCKS_PURGED)) + + if (!rsb_flag(r, RSB_RECOVER_GRANT)) + continue; + rsb_clear_flag(r, RSB_RECOVER_GRANT); + if (!is_master(r)) continue; hold_rsb(r); - rsb_clear_flag(r, RSB_LOCKS_PURGED); - r_ret = r; - break; + spin_unlock(&ls->ls_rsbtbl[bucket].lock); + return r; } spin_unlock(&ls->ls_rsbtbl[bucket].lock); - return r_ret; + return NULL; } -void dlm_grant_after_purge(struct dlm_ls *ls) +/* + * Attempt to grant locks on resources that we are the master of. + * Locks may have become grantable during recovery because locks + * from departed nodes have been purged (or not rebuilt), allowing + * previously blocked locks to now be granted. The subset of rsb's + * we are interested in are those with lkb's on either the convert or + * waiting queues. + * + * Simplest would be to go through each master rsb and check for non-empty + * convert or waiting queues, and attempt to grant on those rsbs. + * Checking the queues requires lock_rsb, though, for which we'd need + * to release the rsbtbl lock. This would make iterating through all + * rsb's very inefficient. So, we rely on earlier recovery routines + * to set RECOVER_GRANT on any rsb's that we should attempt to grant + * locks for. 
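find_grant_rsb() above shows the scan discipline that makes dlm_recover_grant() safe: search a bucket under its spinlock, and on a hit clear the flag, take a reference, and drop the lock before returning, so the caller can take the sleeping rsb lock. Distilled (minus the is_master() filter):

static struct dlm_rsb *grab_flagged_rsb(struct dlm_ls *ls, int bucket)
{
	struct rb_node *n;
	struct dlm_rsb *r;

	spin_lock(&ls->ls_rsbtbl[bucket].lock);
	for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
		r = rb_entry(n, struct dlm_rsb, res_hashnode);
		if (!rsb_flag(r, RSB_RECOVER_GRANT))
			continue;
		rsb_clear_flag(r, RSB_RECOVER_GRANT);
		hold_rsb(r);	/* pin before dropping the bucket lock */
		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
		return r;
	}
	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
	return NULL;
}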
+ */ + +void dlm_recover_grant(struct dlm_ls *ls) { struct dlm_rsb *r; int bucket = 0; + unsigned int count = 0; + unsigned int rsb_count = 0; + unsigned int lkb_count = 0; while (1) { - r = find_purged_rsb(ls, bucket); + r = find_grant_rsb(ls, bucket); if (!r) { if (bucket == ls->ls_rsbtbl_size - 1) break; bucket++; continue; } + rsb_count++; + count = 0; lock_rsb(r); - if (is_master(r)) { - grant_pending_locks(r); - confirm_master(r, 0); - } + grant_pending_locks(r, &count); + lkb_count += count; + confirm_master(r, 0); unlock_rsb(r); put_rsb(r); - schedule(); + cond_resched(); } + + if (lkb_count) + log_debug(ls, "dlm_recover_grant %u locks on %u resources", + lkb_count, rsb_count); } static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, @@ -4631,6 +4797,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; struct dlm_lkb *lkb; + uint32_t remid = 0; int error; if (rl->rl_parent_lkid) { @@ -4638,14 +4805,31 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) goto out; } - error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), - R_MASTER, &r); + remid = le32_to_cpu(rl->rl_lkid); + + /* In general we expect the rsb returned to be R_MASTER, but we don't + have to require it. Recovery of masters on one node can overlap + recovery of locks on another node, so one node can send us MSTCPY + locks before we've made ourselves master of this rsb. We can still + add new MSTCPY locks that we receive here without any harm; when + we make ourselves master, dlm_recover_masters() won't touch the + MSTCPY locks we've received early. */ + + error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen), 0, &r); if (error) goto out; + if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) { + log_error(ls, "dlm_recover_master_copy remote %d %x not dir", + rc->rc_header.h_nodeid, remid); + error = -EBADR; + put_rsb(r); + goto out; + } + lock_rsb(r); - lkb = search_remid(r, rc->rc_header.h_nodeid, le32_to_cpu(rl->rl_lkid)); + lkb = search_remid(r, rc->rc_header.h_nodeid, remid); if (lkb) { error = -EEXIST; goto out_remid; @@ -4664,19 +4848,25 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) attach_lkb(r, lkb); add_lkb(r, lkb, rl->rl_status); error = 0; + ls->ls_recover_locks_in++; + + if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) + rsb_set_flag(r, RSB_RECOVER_GRANT); out_remid: /* this is the new value returned to the lock holder for saving in its process-copy lkb */ rl->rl_remid = cpu_to_le32(lkb->lkb_id); + lkb->lkb_recover_seq = ls->ls_recover_seq; + out_unlock: unlock_rsb(r); put_rsb(r); out: - if (error) - log_debug(ls, "recover_master_copy %d %x", error, - le32_to_cpu(rl->rl_lkid)); + if (error && error != -EEXIST) + log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", + rc->rc_header.h_nodeid, remid, error); rl->rl_result = cpu_to_le32(error); return error; } @@ -4687,41 +4877,52 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; struct dlm_lkb *lkb; - int error; + uint32_t lkid, remid; + int error, result; + + lkid = le32_to_cpu(rl->rl_lkid); + remid = le32_to_cpu(rl->rl_remid); + result = le32_to_cpu(rl->rl_result); - error = find_lkb(ls, le32_to_cpu(rl->rl_lkid), &lkb); + error = find_lkb(ls, lkid, &lkb); if (error) { - log_error(ls, "recover_process_copy no lkid %x", - 
le32_to_cpu(rl->rl_lkid)); + log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); return error; } - DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); - - error = le32_to_cpu(rl->rl_result); - r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - switch (error) { + if (!is_process_copy(lkb)) { + log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + dlm_dump_rsb(r); + unlock_rsb(r); + put_rsb(r); + dlm_put_lkb(lkb); + return -EINVAL; + } + + switch (result) { case -EBADR: /* There's a chance the new master received our lock before dlm_recover_master_reply(), this wouldn't happen if we did a barrier between recover_masters and recover_locks. */ - log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id, - (unsigned long)r, r->res_name); + + log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d", + lkid, rc->rc_header.h_nodeid, remid, result); + dlm_send_rcom_lock(r, lkb); goto out; case -EEXIST: - log_debug(ls, "master copy exists %x", lkb->lkb_id); - /* fall through */ case 0: - lkb->lkb_remid = le32_to_cpu(rl->rl_remid); + lkb->lkb_remid = remid; break; default: - log_error(ls, "dlm_recover_process_copy unknown error %d %x", - error, lkb->lkb_id); + log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk", + lkid, rc->rc_header.h_nodeid, remid, result); } /* an ack for dlm_recover_locks() which waits for replies from diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 1a25530..c8b226c 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -15,7 +15,8 @@ void dlm_dump_rsb(struct dlm_rsb *r); void dlm_print_lkb(struct dlm_lkb *lkb); -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); +void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, + uint32_t saved_seq); void dlm_receive_buffer(union dlm_packet *p, int nodeid); int dlm_modes_compat(int mode1, int mode2); void dlm_put_rsb(struct dlm_rsb *r); @@ -31,9 +32,9 @@ void dlm_adjust_timeouts(struct dlm_ls *ls); int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, unsigned int flags, struct dlm_rsb **r_ret); -int dlm_purge_locks(struct dlm_ls *ls); +void dlm_recover_purge(struct dlm_ls *ls); void dlm_purge_mstcpy_locks(struct dlm_rsb *r); -void dlm_grant_after_purge(struct dlm_ls *ls); +void dlm_recover_grant(struct dlm_ls *ls); int dlm_recover_waiters_post(struct dlm_ls *ls); void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index a1ea25f..ca506abb 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -74,6 +74,19 @@ static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) return len; } +static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", dlm_no_directory(ls)); +} + +static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) +{ + int val = simple_strtoul(buf, NULL, 0); + if (val == 1) + set_bit(LSFL_NODIR, &ls->ls_flags); + return len; +} + static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf) { uint32_t status = dlm_recover_status(ls); @@ -107,6 +120,12 @@ static struct dlm_attr dlm_attr_id = { .store = dlm_id_store }; +static struct dlm_attr dlm_attr_nodir = { + .attr = {.name = "nodir", .mode = S_IRUGO | S_IWUSR}, + .show = dlm_nodir_show, + .store = dlm_nodir_store +}; + static struct dlm_attr dlm_attr_recover_status = { 
.attr = {.name = "recover_status", .mode = S_IRUGO}, .show = dlm_recover_status_show @@ -121,6 +140,7 @@ static struct attribute *dlm_attrs[] = { &dlm_attr_control.attr, &dlm_attr_event.attr, &dlm_attr_id.attr, + &dlm_attr_nodir.attr, &dlm_attr_recover_status.attr, &dlm_attr_recover_nodeid.attr, NULL, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 133ef6d..5c1b0e3 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -142,6 +142,7 @@ struct writequeue_entry { static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; +static int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -710,6 +711,13 @@ static int tcp_accept_from_sock(struct connection *con) struct connection *newcon; struct connection *addcon; + mutex_lock(&connections_lock); + if (!dlm_allow_conn) { + mutex_unlock(&connections_lock); + return -1; + } + mutex_unlock(&connections_lock); + memset(&peeraddr, 0, sizeof(peeraddr)); result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); @@ -1503,6 +1511,7 @@ void dlm_lowcomms_stop(void) socket activity. */ mutex_lock(&connections_lock); + dlm_allow_conn = 0; foreach_conn(stop_conn); mutex_unlock(&connections_lock); @@ -1530,7 +1539,7 @@ int dlm_lowcomms_start(void) if (!dlm_local_count) { error = -ENOTCONN; log_print("no local IP address has been set"); - goto out; + goto fail; } error = -ENOMEM; @@ -1538,7 +1547,13 @@ int dlm_lowcomms_start(void) __alignof__(struct connection), 0, NULL); if (!con_cache) - goto out; + goto fail; + + error = work_start(); + if (error) + goto fail_destroy; + + dlm_allow_conn = 1; /* Start listening */ if (dlm_config.ci_protocol == 0) @@ -1548,20 +1563,17 @@ int dlm_lowcomms_start(void) if (error) goto fail_unlisten; - error = work_start(); - if (error) - goto fail_unlisten; - return 0; fail_unlisten: + dlm_allow_conn = 0; con = nodeid2con(0,0); if (con) { close_connection(con, false); kmem_cache_free(con_cache, con); } +fail_destroy: kmem_cache_destroy(con_cache); - -out: +fail: return error; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index da64df7..7cd24bc 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -21,21 +21,19 @@ static struct kmem_cache *rsb_cache; int __init dlm_memory_init(void) { - int ret = 0; - lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), __alignof__(struct dlm_lkb), 0, NULL); if (!lkb_cache) - ret = -ENOMEM; + return -ENOMEM; rsb_cache = kmem_cache_create("dlm_rsb", sizeof(struct dlm_rsb), __alignof__(struct dlm_rsb), 0, NULL); if (!rsb_cache) { kmem_cache_destroy(lkb_cache); - ret = -ENOMEM; + return -ENOMEM; } - return ret; + return 0; } void dlm_memory_exit(void) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index ac5c616..64d3e2b 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -486,47 +486,50 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) return 0; } -static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +/* Called by dlm_recv; corresponds to dlm_receive_message() but special + recovery-only comms are sent through here. 
*/ + +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { + int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + int stop, reply = 0, lock = 0; + uint32_t status; uint64_t seq; - int rv = 0; switch (rc->rc_type) { + case DLM_RCOM_LOCK: + lock = 1; + break; + case DLM_RCOM_LOCK_REPLY: + lock = 1; + reply = 1; + break; case DLM_RCOM_STATUS_REPLY: case DLM_RCOM_NAMES_REPLY: case DLM_RCOM_LOOKUP_REPLY: - case DLM_RCOM_LOCK_REPLY: - spin_lock(&ls->ls_recover_lock); - seq = ls->ls_recover_seq; - spin_unlock(&ls->ls_recover_lock); - if (rc->rc_seq_reply != seq) { - log_debug(ls, "ignoring old reply %x from %d " - "seq_reply %llx expect %llx", - rc->rc_type, rc->rc_header.h_nodeid, - (unsigned long long)rc->rc_seq_reply, - (unsigned long long)seq); - rv = 1; - } - } - return rv; -} - -/* Called by dlm_recv; corresponds to dlm_receive_message() but special - recovery-only comms are sent through here. */ + reply = 1; + }; -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) -{ - int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); + spin_lock(&ls->ls_recover_lock); + status = ls->ls_recover_status; + stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); + seq = ls->ls_recover_seq; + spin_unlock(&ls->ls_recover_lock); - if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { - log_debug(ls, "ignoring recovery message %x from %d", - rc->rc_type, nodeid); + if ((stop && (rc->rc_type != DLM_RCOM_STATUS)) || + (reply && (rc->rc_seq_reply != seq)) || + (lock && !(status & DLM_RS_DIR))) { + log_limit(ls, "dlm_receive_rcom ignore msg %d " + "from %d %llu %llu recover seq %llu sts %x gen %u", + rc->rc_type, + nodeid, + (unsigned long long)rc->rc_seq, + (unsigned long long)rc->rc_seq_reply, + (unsigned long long)seq, + status, ls->ls_generation); goto out; } - if (is_old_reply(ls, rc)) - goto out; - switch (rc->rc_type) { case DLM_RCOM_STATUS: receive_rcom_status(ls, rc); diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 34d5adf1f..7554e4d 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -339,9 +339,12 @@ static void set_lock_master(struct list_head *queue, int nodeid) { struct dlm_lkb *lkb; - list_for_each_entry(lkb, queue, lkb_statequeue) - if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) + list_for_each_entry(lkb, queue, lkb_statequeue) { + if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) { lkb->lkb_nodeid = nodeid; + lkb->lkb_remid = 0; + } + } } static void set_master_lkbs(struct dlm_rsb *r) @@ -354,18 +357,16 @@ static void set_master_lkbs(struct dlm_rsb *r) /* * Propagate the new master nodeid to locks * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. - * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which + * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which * rsb's to consider. 
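The three ignore conditions that replace is_old_reply() above can be read as one predicate. The following standalone restatement (a hypothetical helper with the same logic, assuming the dlm-internal constants DLM_RCOM_STATUS and DLM_RS_DIR) makes the mapping explicit: drop everything but STATUS while recovery is stopped, drop replies carrying a stale recovery sequence, and drop lock rcoms until the directory phase has completed:

#include <linux/types.h>

/* hypothetical restatement of the combined ignore test in dlm_receive_rcom() */
static bool ignore_rcom_msg(int rc_type, bool stop, bool reply, bool lock,
			    u64 seq_reply, u64 seq, u32 status)
{
	return (stop && (rc_type != DLM_RCOM_STATUS)) ||	/* recovery stopped */
	       (reply && (seq_reply != seq)) ||			/* stale reply */
	       (lock && !(status & DLM_RS_DIR));		/* dir phase not done */
}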
*/ static void set_new_master(struct dlm_rsb *r, int nodeid) { - lock_rsb(r); r->res_nodeid = nodeid; set_master_lkbs(r); rsb_set_flag(r, RSB_NEW_MASTER); rsb_set_flag(r, RSB_NEW_MASTER2); - unlock_rsb(r); } /* @@ -376,9 +377,9 @@ static void set_new_master(struct dlm_rsb *r, int nodeid) static int recover_master(struct dlm_rsb *r) { struct dlm_ls *ls = r->res_ls; - int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); - - dir_nodeid = dlm_dir_nodeid(r); + int error, ret_nodeid; + int our_nodeid = dlm_our_nodeid(); + int dir_nodeid = dlm_dir_nodeid(r); if (dir_nodeid == our_nodeid) { error = dlm_dir_lookup(ls, our_nodeid, r->res_name, @@ -388,7 +389,9 @@ static int recover_master(struct dlm_rsb *r) if (ret_nodeid == our_nodeid) ret_nodeid = 0; + lock_rsb(r); set_new_master(r, ret_nodeid); + unlock_rsb(r); } else { recover_list_add(r); error = dlm_send_rcom_lookup(r, dir_nodeid); @@ -398,24 +401,33 @@ static int recover_master(struct dlm_rsb *r) } /* - * When not using a directory, most resource names will hash to a new static - * master nodeid and the resource will need to be remastered. + * All MSTCPY locks are purged and rebuilt, even if the master stayed the same. + * This is necessary because recovery can be started, aborted and restarted, + * causing the master nodeid to briefly change during the aborted recovery, and + * change back to the original value in the second recovery. The MSTCPY locks + * may or may not have been purged during the aborted recovery. Another node + * with an outstanding request on its waiters list and a request reply saved in the + * requestqueue cannot know whether it should ignore the reply and resend the + * request, or accept the reply and complete the request. It must do the + * former if the remote node purged MSTCPY locks, and it must do the latter if + * the remote node did not. This is solved by always purging MSTCPY locks, in + * which case the request reply would always be ignored and the request + * resent.
*/ static int recover_master_static(struct dlm_rsb *r) { - int master = dlm_dir_nodeid(r); + int dir_nodeid = dlm_dir_nodeid(r); + int new_master = dir_nodeid; - if (master == dlm_our_nodeid()) - master = 0; + if (dir_nodeid == dlm_our_nodeid()) + new_master = 0; - if (r->res_nodeid != master) { - if (is_master(r)) - dlm_purge_mstcpy_locks(r); - set_new_master(r, master); - return 1; - } - return 0; + lock_rsb(r); + dlm_purge_mstcpy_locks(r); + set_new_master(r, new_master); + unlock_rsb(r); + return 1; } /* @@ -481,7 +493,9 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) if (nodeid == dlm_our_nodeid()) nodeid = 0; + lock_rsb(r); set_new_master(r, nodeid); + unlock_rsb(r); recover_list_del(r); if (recover_list_empty(ls)) @@ -556,8 +570,6 @@ int dlm_recover_locks(struct dlm_ls *ls) struct dlm_rsb *r; int error, count = 0; - log_debug(ls, "dlm_recover_locks"); - down_read(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { if (is_master(r)) { @@ -584,7 +596,7 @@ int dlm_recover_locks(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_locks %d locks", count); + log_debug(ls, "dlm_recover_locks %d out", count); error = dlm_wait_function(ls, &recover_list_empty); out: @@ -721,21 +733,19 @@ static void recover_conversion(struct dlm_rsb *r) } /* We've become the new master for this rsb and waiting/converting locks may - need to be granted in dlm_grant_after_purge() due to locks that may have + need to be granted in dlm_recover_grant() due to locks that may have existed from a removed node. */ -static void set_locks_purged(struct dlm_rsb *r) +static void recover_grant(struct dlm_rsb *r) { if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue)) - rsb_set_flag(r, RSB_LOCKS_PURGED); + rsb_set_flag(r, RSB_RECOVER_GRANT); } void dlm_recover_rsbs(struct dlm_ls *ls) { struct dlm_rsb *r; - int count = 0; - - log_debug(ls, "dlm_recover_rsbs"); + unsigned int count = 0; down_read(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { @@ -744,7 +754,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) if (rsb_flag(r, RSB_RECOVER_CONVERT)) recover_conversion(r); if (rsb_flag(r, RSB_NEW_MASTER2)) - set_locks_purged(r); + recover_grant(r); recover_lvb(r); count++; } @@ -754,7 +764,8 @@ void dlm_recover_rsbs(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_rsbs %d rsbs", count); + if (count) + log_debug(ls, "dlm_recover_rsbs %d done", count); } /* Create a single list of all root rsb's to be used during recovery */ diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 3780caf..f1a9073 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq); + log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -84,6 +84,8 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) goto fail; } + ls->ls_recover_locks_in = 0; + dlm_set_recover_status(ls, DLM_RS_NODES); error = dlm_recover_members_wait(ls); @@ -130,7 +132,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * Clear lkb's for departed nodes. 
*/ - dlm_purge_locks(ls); + dlm_recover_purge(ls); /* * Get new master nodeid's for rsb's that were mastered on @@ -161,6 +163,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) goto fail; } + log_debug(ls, "dlm_recover_locks %u in", + ls->ls_recover_locks_in); + /* * Finalize state in master rsb's now that all locks can be * checked. This includes conversion resolution and lvb @@ -225,9 +230,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) goto fail; } - dlm_grant_after_purge(ls); + dlm_recover_grant(ls); - log_debug(ls, "dlm_recover %llx generation %u done: %u ms", + log_debug(ls, "dlm_recover %llu generation %u done: %u ms", (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -237,7 +242,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "dlm_recover %llx error %d", + log_debug(ls, "dlm_recover %llu error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index a44fa22..1695f1b 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -19,6 +19,7 @@ struct rq_entry { struct list_head list; + uint32_t recover_seq; int nodeid; struct dlm_message request; }; @@ -41,6 +42,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) return; } + e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF; e->nodeid = nodeid; memcpy(&e->request, ms, ms->m_header.h_length); @@ -63,6 +65,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) int dlm_process_requestqueue(struct dlm_ls *ls) { struct rq_entry *e; + struct dlm_message *ms; int error = 0; mutex_lock(&ls->ls_requestqueue_mutex); @@ -76,7 +79,15 @@ int dlm_process_requestqueue(struct dlm_ls *ls) e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); mutex_unlock(&ls->ls_requestqueue_mutex); - dlm_receive_message_saved(ls, &e->request); + ms = &e->request; + + log_limit(ls, "dlm_process_requestqueue msg %d from %d " + "lkid %x remid %x result %d seq %u", + ms->m_type, ms->m_header.h_nodeid, + ms->m_lkid, ms->m_remid, ms->m_result, + e->recover_seq); + + dlm_receive_message_saved(ls, &e->request, e->recover_seq); mutex_lock(&ls->ls_requestqueue_mutex); list_del(&e->list); @@ -138,35 +149,7 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid) if (!dlm_no_directory(ls)) return 0; - /* with no directory, the master is likely to change as a part of - recovery; requests to/from the defunct master need to be purged */ - - switch (type) { - case DLM_MSG_REQUEST: - case DLM_MSG_CONVERT: - case DLM_MSG_UNLOCK: - case DLM_MSG_CANCEL: - /* we're no longer the master of this resource, the sender - will resend to the new master (see waiter_needs_recovery) */ - - if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid()) - return 1; - break; - - case DLM_MSG_REQUEST_REPLY: - case DLM_MSG_CONVERT_REPLY: - case DLM_MSG_UNLOCK_REPLY: - case DLM_MSG_CANCEL_REPLY: - case DLM_MSG_GRANT: - /* this reply is from the former master of the resource, - we'll resend to the new master if needed */ - - if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid) - return 1; - break; - } - - return 0; + return 1; } void dlm_purge_requestqueue(struct dlm_ls *ls) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index ab22480..a750f95 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c 
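The ecryptfs hunks that follow, like the exofs, ext2 and ext3 hunks later in this series, repeat one mechanical rename: end_writeback() becomes clear_inode(), with each filesystem's evict_inode sequence otherwise unchanged. A generic sketch of that shape, using a made-up filesystem name:

#include <linux/fs.h>
#include <linux/mm.h>

/* hypothetical filesystem: the common evict_inode skeleton after the rename */
static void examplefs_evict_inode(struct inode *inode)
{
	truncate_inode_pages(&inode->i_data, 0);	/* drop the page cache */
	clear_inode(inode);				/* was end_writeback(inode) */
	/* filesystem-specific teardown (buffers, preallocations) follows */
}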
@@ -303,7 +303,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, mutex_unlock(&ecryptfs_daemon_hash_mux); goto wake_up; } - tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns; + tsk_user_ns = __task_cred(msg_ctx->task)->user_ns; ctx_euid = task_euid(msg_ctx->task); rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns); rcu_read_unlock(); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2dd946b..e879cf8 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -133,7 +133,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) static void ecryptfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); iput(ecryptfs_inode_to_lower(inode)); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c0b3c70..079d1be 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -33,6 +33,7 @@ #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> +#include <linux/device.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/mman.h> @@ -87,7 +88,7 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -154,6 +155,9 @@ struct epitem { /* List header used to link this item to the "struct file" items list */ struct list_head fllink; + /* wakeup_source used when EPOLLWAKEUP is set */ + struct wakeup_source *ws; + /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; @@ -194,6 +198,9 @@ struct eventpoll { */ struct epitem *ovflist; + /* wakeup_source used when ep_scan_ready_list is running */ + struct wakeup_source *ws; + /* The user that created the eventpoll descriptor */ struct user_struct *user; @@ -588,8 +595,10 @@ static int ep_scan_ready_list(struct eventpoll *ep, * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ - if (!ep_is_linked(&epi->rdllink)) + if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); + } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after @@ -602,6 +611,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * Quickly re-inject items left on "txlist". */ list_splice(&txlist, &ep->rdllist); + __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { /* @@ -656,6 +666,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); + wakeup_source_unregister(epi->ws); + /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); @@ -706,6 +718,7 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); free_uid(ep->user); + wakeup_source_unregister(ep->ws); kfree(ep); } @@ -737,6 +750,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. 
*/ + __pm_relax(epi->ws); list_del_init(&epi->rdllink); } } @@ -927,13 +941,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; + if (epi->ws) { + /* + * Activate ep->ws since epi->ws may get + * deactivated at any time. + */ + __pm_stay_awake(ep->ws); + } + } goto out_unlock; } /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(&epi->rdllink)) + if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); + } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() @@ -1091,6 +1115,30 @@ static int reverse_path_check(void) return error; } +static int ep_create_wakeup_source(struct epitem *epi) +{ + const char *name; + + if (!epi->ep->ws) { + epi->ep->ws = wakeup_source_register("eventpoll"); + if (!epi->ep->ws) + return -ENOMEM; + } + + name = epi->ffd.file->f_path.dentry->d_name.name; + epi->ws = wakeup_source_register(name); + if (!epi->ws) + return -ENOMEM; + + return 0; +} + +static void ep_destroy_wakeup_source(struct epitem *epi) +{ + wakeup_source_unregister(epi->ws); + epi->ws = NULL; +} + /* * Must be called with "mtx" held. */ @@ -1118,6 +1166,13 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, epi->event = *event; epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) + goto error_create_wakeup_source; + } else { + epi->ws = NULL; + } /* Initialize the poll table using the queue callback */ epq.epi = epi; @@ -1164,6 +1219,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1204,6 +1260,9 @@ error_unregister: list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); + wakeup_source_unregister(epi->ws); + +error_create_wakeup_source: kmem_cache_free(epi_cache, epi); return error; @@ -1229,6 +1288,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even epi->event.events = event->events; pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ + if (epi->event.events & EPOLLWAKEUP) { + if (!epi->ws) + ep_create_wakeup_source(epi); + } else if (epi->ws) { + ep_destroy_wakeup_source(epi); + } /* * Get current event bits. We can safely use the file* here because @@ -1244,6 +1309,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) @@ -1282,6 +1348,18 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, !list_empty(head) && eventcnt < esed->maxevents;) { epi = list_first_entry(head, struct epitem, rdllink); + /* + * Activate ep->ws before deactivating epi->ws to prevent + * triggering auto-suspend here (in case we reactivate epi->ws + * below). + * + * This could be rearranged to delay the deactivation of epi->ws + * instead, but then epi->ws would temporarily be out of sync + * with ep_is_linked().
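The wakeup_source plumbing above is driven from userspace by the new EPOLLWAKEUP flag (gated by the capability check later in this patch). A small, self-contained usage sketch; the flag value matches the kernel definition, and a fallback define covers headers that predate it:

#include <stdio.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>

#ifndef EPOLLWAKEUP
#define EPOLLWAKEUP (1u << 29)	/* matches the new kernel flag */
#endif

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 2 } };
	struct epoll_event ev = { .events = EPOLLIN | EPOLLWAKEUP };
	int epfd = epoll_create1(0);
	int tfd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (epfd < 0 || tfd < 0 || timerfd_settime(tfd, 0, &its, NULL) < 0)
		return 1;

	ev.data.fd = tfd;
	/* Without the capability the kernel strips EPOLLWAKEUP rather than
	 * failing, so this add still succeeds for unprivileged callers. */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev) < 0)
		return 1;

	/* While the event is ready and being processed, the per-item
	 * wakeup_source keeps an opted-in system from auto-suspending. */
	if (epoll_wait(epfd, &ev, 1, -1) == 1)
		printf("event delivered with EPOLLWAKEUP armed\n");
	return 0;
}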
+ */ + if (epi->ws && epi->ws->active) + __pm_stay_awake(ep->ws); + __pm_relax(epi->ws); list_del_init(&epi->rdllink); pt._key = epi->event.events; @@ -1298,6 +1376,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); + __pm_stay_awake(epi->ws); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; @@ -1317,6 +1396,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); + __pm_stay_awake(epi->ws); } } } @@ -1629,6 +1709,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, if (!tfile->f_op || !tfile->f_op->poll) goto error_tgt_fput; + /* Check if EPOLLWAKEUP is allowed */ + if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP)) + epds.events &= ~EPOLLWAKEUP; + /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit @@ -1139,7 +1139,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; - if (current_euid() == current_uid() && current_egid() == current_gid()) + if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) set_dumpable(current->mm, 1); else set_dumpable(current->mm, suid_dumpable); @@ -1153,8 +1153,8 @@ void setup_new_exec(struct linux_binprm * bprm) current->mm->task_size = TASK_SIZE; /* install the new credentials */ - if (bprm->cred->uid != current_euid() || - bprm->cred->gid != current_egid()) { + if (!uid_eq(bprm->cred->uid, current_euid()) || + !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { would_dump(bprm, bprm->file); @@ -1245,6 +1245,13 @@ static int check_unsafe_exec(struct linux_binprm *bprm) bprm->unsafe |= LSM_UNSAFE_PTRACE; } + /* + * This isn't strictly necessary, but it makes it harder for LSMs to + * mess up. + */ + if (current->no_new_privs) + bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); @@ -1288,11 +1295,15 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); - if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { + if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && + !current->no_new_privs) { /* Set-uid? */ if (mode & S_ISUID) { + if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) + return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = inode->i_uid; + } /* Set-gid? */ @@ -1302,6 +1313,8 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) + return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = inode->i_gid; } @@ -1930,8 +1943,21 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (core_waiters > 0) + if (core_waiters > 0) { + struct core_thread *ptr; + wait_for_completion(&core_state->startup); + /* + * Wait for all the threads to become inactive, so that + * all the thread context (extended register state, like + * fpu etc) gets copied to the memory. 
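The uid_eq()/gid_eq() changes in the exec.c hunk above follow the same kuid_t/kgid_t pattern as the ext2 and ext3 conversions below: the ids become wrapper structs, so a raw '==' no longer compiles and every comparison or conversion has to go through an explicit, namespace-aware helper. A kernel-flavoured sketch with hypothetical function names:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/uidgid.h>

/* hypothetical helper: ownership test via uid_eq(), not '==' */
static bool caller_owns_inode(const struct inode *inode)
{
	return uid_eq(inode->i_uid, current_fsuid());
}

/* hypothetical helper: mapping a user-supplied id into the caller's
 * namespace, mirroring the resuid= option parsing below */
static int parse_resuid_option(unsigned int opt, kuid_t *out)
{
	kuid_t uid = make_kuid(current_user_ns(), opt);

	if (!uid_valid(uid))	/* value not representable in this namespace */
		return -EINVAL;
	*out = uid;
	return 0;
}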
+ */ + ptr = core_state->dumper.next; + while (ptr != NULL) { + wait_task_inactive(ptr->task, 0); + ptr = ptr->next; + } + } return core_waiters; } @@ -2121,7 +2147,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) if (__get_dumpable(cprm.mm_flags) == 2) { /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ - cred->fsuid = 0; /* Dump root private */ + cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */ } retval = coredump_wait(exit_code, &core_state); @@ -2222,7 +2248,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * Dont allow local users get cute and trick others to coredump * into their pre-created files. */ - if (inode->i_uid != current_fsuid()) + if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; if (!cprm.file->f_op || !cprm.file->f_op->write) goto close_fail; diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 352ba14..389ba83 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -16,5 +16,5 @@ libore-y := ore.o ore_raid.o obj-$(CONFIG_ORE) += libore.o -exofs-y := inode.o file.o symlink.o namei.o dir.o super.o +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index ca9d496..fffe86f 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -56,6 +56,9 @@ struct exofs_dev { struct ore_dev ored; unsigned did; + unsigned urilen; + uint8_t *uri; + struct kobject ed_kobj; }; /* * our extension to the in-memory superblock @@ -73,6 +76,7 @@ struct exofs_sb_info { struct ore_layout layout; /* Default files layout */ struct ore_comp one_comp; /* id & cred of partition id=0*/ struct ore_components oc; /* comps for the partition */ + struct kobject s_kobj; /* holds per-sbi kobject */ }; /* @@ -176,6 +180,16 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj); int exofs_sbi_write_stats(struct exofs_sb_info *sbi); +/* sys.c */ +int exofs_sysfs_init(void); +void exofs_sysfs_uninit(void); +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev); +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); +int exofs_sysfs_odev_add(struct exofs_dev *edev, + struct exofs_sb_info *sbi); +void exofs_sysfs_dbg_print(void); + /********************* * operation vectors * *********************/ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index ea5e1f9..5badb0c 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1473,7 +1473,7 @@ void exofs_evict_inode(struct inode *inode) goto no_delete; inode->i_size = 0; - end_writeback(inode); + clear_inode(inode); /* if we are deleting an obj that hasn't been created yet, wait. * This also makes sure that create_done cannot be called with an @@ -1503,5 +1503,5 @@ void exofs_evict_inode(struct inode *inode) return; no_delete: - end_writeback(inode); + clear_inode(inode); } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 735ca06..4337836 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -472,6 +472,7 @@ static void exofs_put_super(struct super_block *sb) _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); + exofs_sysfs_sb_del(sbi); bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -632,6 +633,12 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], (numdevs - 1) * sizeof(sbi->oc.ods[0])); + /* create sysfs subdir under which we put the device table + * And cluster layout. 
A Superblock is identified by the string: + * "dev[0].osdname"_"pid" + */ + exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); + for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; struct osd_dev_info odi; @@ -657,6 +664,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, eds[i].ored.od = fscb_od; ++sbi->oc.numdevs; fscb_od = NULL; + exofs_sysfs_odev_add(&eds[i], sbi); continue; } @@ -682,6 +690,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, odi.osdname); goto out; } + exofs_sysfs_odev_add(&eds[i], sbi); /* TODO: verify other information is correct and FS-uuid * matches. Benny what did you say about device table @@ -745,7 +754,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->one_comp.obj.partition = opts->pid; sbi->one_comp.obj.id = 0; exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->oc.numdevs = 1; sbi->oc.single_comp = EC_SINGLE_COMP; sbi->oc.comps = &sbi->one_comp; @@ -804,6 +812,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; ore_comp_set_dev(&sbi->oc, 0, od); + sbi->oc.numdevs = 1; } __sbi_read_stats(sbi); @@ -844,6 +853,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } + exofs_sysfs_dbg_print(); _exofs_print_device("Mounting", opts->dev_name, ore_comp_dev(&sbi->oc, 0), sbi->one_comp.obj.partition); @@ -1023,6 +1033,9 @@ static int __init init_exofs(void) if (err) goto out_d; + /* We don't fail if sysfs creation failed */ + exofs_sysfs_init(); + return 0; out_d: destroy_inodecache(); @@ -1032,6 +1045,7 @@ out: static void __exit exit_exofs(void) { + exofs_sysfs_uninit(); unregister_filesystem(&exofs_type); destroy_inodecache(); } diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c new file mode 100644 index 0000000..e32bc91 --- /dev/null +++ b/fs/exofs/sys.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2012 + * Sachin Bhamare <sbhamare@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published by + * the Free Software Foundation. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the: + * Free Software Foundation <licensing@fsf.org> + */ + +#include <linux/kobject.h> +#include <linux/device.h> + +#include "exofs.h" + +struct odev_attr { + struct attribute attr; + ssize_t (*show)(struct exofs_dev *, char *); + ssize_t (*store)(struct exofs_dev *, const char *, size_t); +}; + +static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->show ? a->show(edp, buf) : 0; +} + +static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->store ? 
a->store(edp, buf, len) : len; +} + +static const struct sysfs_ops odev_attr_ops = { + .show = odev_attr_show, + .store = odev_attr_store, +}; + + +static struct kset *exofs_kset; + +static ssize_t osdname_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); +} + +static ssize_t systemid_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + memcpy(buf, odi->systemid, odi->systemid_len); + return odi->systemid_len; +} + +static ssize_t uri_show(struct exofs_dev *edp, char *buf) +{ + return snprintf(buf, edp->urilen, "%s", edp->uri); +} + +static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) +{ + edp->urilen = strlen(buf) + 1; + edp->uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + strncpy(edp->uri, buf, edp->urilen); + return edp->urilen; +} + +#define OSD_ATTR(name, mode, show, store) \ + static struct odev_attr odev_attr_##name = \ + __ATTR(name, mode, show, store) + +OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); +OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); +OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); + +static struct attribute *odev_attrs[] = { + &odev_attr_osdname.attr, + &odev_attr_systemid.attr, + &odev_attr_uri.attr, + NULL, +}; + +static struct kobj_type odev_ktype = { + .default_attrs = odev_attrs, + .sysfs_ops = &odev_attr_ops, +}; + +static struct kobj_type uuid_ktype = { +}; + +void exofs_sysfs_dbg_print() +{ +#ifdef CONFIG_EXOFS_DEBUG + struct kobject *k_name, *k_tmp; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + printk(KERN_INFO "%s: name %s ref %d\n", + __func__, kobject_name(k_name), + (int)atomic_read(&k_name->kref.refcount)); + } +#endif +} +/* + * This function removes all kobjects under exofs_kset + * At the end of it, exofs_kset kobject will have a refcount + * of 1 which gets decremented only on exofs module unload + */ +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) +{ + struct kobject *k_name, *k_tmp; + struct kobject *s_kobj = &sbi->s_kobj; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + /* Remove all that are children of this SBI */ + if (k_name->parent == s_kobj) + kobject_put(k_name); + } + kobject_put(s_kobj); +} + +/* + * This function creates sysfs entries to hold the current exofs cluster + * instance (uniquely identified by osdname,pid tuple). + * This function gets called once per exofs mount instance. 
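Given the layout described above, each mounted cluster appears as /sys/fs/exofs/<osdname>_<pid>/ with one dev<did>/ child per OSD, each exposing osdname, systemid and uri. A userspace sketch reading one attribute; the concrete instance name is made up (osdname "osd0", partition id 0x10000):

#include <stdio.h>

int main(void)
{
	/* hypothetical mount instance: "<dev[0].osdname>_<pid>" */
	const char *path = "/sys/fs/exofs/osd0_10000/dev0/osdname";
	char buf[128];
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("dev0 osdname: %s\n", buf);
	fclose(f);
	return 0;
}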
+ */ +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev) +{ + struct kobject *s_kobj; + int retval = 0; + uint64_t pid = sbi->one_comp.obj.partition; + + /* allocate new uuid dirent */ + s_kobj = &sbi->s_kobj; + s_kobj->kset = exofs_kset; + retval = kobject_init_and_add(s_kobj, &uuid_ktype, + &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); + return -ENOMEM; + } + return 0; +} + +int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) +{ + struct kobject *d_kobj; + int retval = 0; + + /* create osd device group which contains following attributes + * osdname, systemid & uri + */ + d_kobj = &edev->ed_kobj; + d_kobj->kset = exofs_kset; + retval = kobject_init_and_add(d_kobj, &odev_ktype, + &sbi->s_kobj, "dev%u", edev->did); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "device dev%u\n", edev->did); + return retval; + } + return 0; +} + +int exofs_sysfs_init(void) +{ + exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); + if (!exofs_kset) { + EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); + return -ENOMEM; + } + return 0; +} + +void exofs_sysfs_uninit(void) +{ + kset_unregister(exofs_kset); +} diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index a8cbe1b..1c36139 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -165,7 +165,6 @@ static void release_blocks(struct super_block *sb, int count) struct ext2_sb_info *sbi = EXT2_SB(sb); percpu_counter_add(&sbi->s_freeblocks_counter, count); - sb->s_dirt = 1; } } @@ -180,7 +179,6 @@ static void group_adjust_blocks(struct super_block *sb, int group_no, free_blocks = le16_to_cpu(desc->bg_free_blocks_count); desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count); spin_unlock(sb_bgl_lock(sbi, group_no)); - sb->s_dirt = 1; mark_buffer_dirty(bh); } } @@ -479,7 +477,7 @@ void ext2_discard_reservation(struct inode *inode) } /** - * ext2_free_blocks_sb() -- Free given blocks and update quota and i_blocks + * ext2_free_blocks() -- Free given blocks and update quota and i_blocks * @inode: inode * @block: start physcial block to free * @count: number of blocks to free @@ -1193,8 +1191,9 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi) free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { return 0; } return 1; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 0b2b4db..d9a17d0 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -82,8 +82,8 @@ struct ext2_sb_info { struct buffer_head ** s_group_desc; unsigned long s_mount_opt; unsigned long s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; @@ -637,8 +637,8 @@ static inline void verify_offsets(void) */ struct ext2_mount_options { unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; }; /* diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8b15cf8..c13eb7b 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -81,7 +81,6 @@ static void 
ext2_release_inode(struct super_block *sb, int group, int dir) spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); if (dir) percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); - sb->s_dirt = 1; mark_buffer_dirty(bh); } @@ -543,7 +542,6 @@ got: } spin_unlock(sb_bgl_lock(sbi, group)); - sb->s_dirt = 1; mark_buffer_dirty(bh2); if (test_opt(sb, GRPID)) { inode->i_mode = mode; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 740cad8..264d315 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -90,7 +90,7 @@ void ext2_evict_inode(struct inode * inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); ext2_discard_reservation(inode); rsv = EXT2_I(inode)->i_block_alloc_info; @@ -1293,6 +1293,8 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) struct inode *inode; long ret = -EIO; int n; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -1310,12 +1312,14 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); @@ -1413,8 +1417,8 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; - uid_t uid = inode->i_uid; - gid_t gid = inode->i_gid; + uid_t uid = i_uid_read(inode); + gid_t gid = i_gid_read(inode); struct buffer_head * bh; struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); int n; @@ -1529,8 +1533,8 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (is_quota_modification(inode, iattr)) dquot_initialize(inode); - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || + (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { error = dquot_transfer(inode, iattr); if (error) return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index dffb865..f663a67 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -79,7 +79,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext2_get_parent(struct dentry *child) { - struct qstr dotdot = {.name = "..", .len = 2}; + struct qstr dotdot = QSTR_INIT("..", 2); unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); if (!ino) return ERR_PTR(-ENOENT); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e1025c7..b3621cb 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -130,9 +130,6 @@ static void ext2_put_super (struct super_block * sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (sb->s_dirt) - ext2_write_super(sb); - ext2_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = 
sbi->s_es; @@ -228,13 +225,15 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpid"); if (!test_opt(sb, GRPID) && (def_mount_opts & EXT2_DEFM_BSDGROUPS)) seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT2_DEF_RESUID || + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT2_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT2_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); } - if (sbi->s_resgid != EXT2_DEF_RESGID || + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT2_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT2_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } if (test_opt(sb, ERRORS_RO)) { int def_errors = le16_to_cpu(es->s_errors); @@ -305,7 +304,6 @@ static const struct super_operations ext2_sops = { .write_inode = ext2_write_inode, .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, - .write_super = ext2_write_super, .sync_fs = ext2_sync_fs, .statfs = ext2_statfs, .remount_fs = ext2_remount, @@ -356,11 +354,6 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, ext2_nfs_get_inode); } -/* Yes, most of these are left as NULL!! - * A NULL value implies the default, which works with ext2-like file - * systems, but can be improved upon. - * Currently only get_parent is required. - */ static const struct export_operations ext2_export_ops = { .fh_to_dentry = ext2_fh_to_dentry, .fh_to_parent = ext2_fh_to_parent, @@ -436,6 +429,8 @@ static int parse_options(char *options, struct super_block *sb) struct ext2_sb_info *sbi = EXT2_SB(sb); substring_t args[MAX_OPT_ARGS]; int option; + kuid_t uid; + kgid_t gid; if (!options) return 1; @@ -462,12 +457,23 @@ static int parse_options(char *options, struct super_block *sb) case Opt_resuid: if (match_int(&args[0], &option)) return 0; - sbi->s_resuid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return -1; + + } + sbi->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) return 0; - sbi->s_resgid = option; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return -1; + } + sbi->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ @@ -841,8 +847,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) else set_opt(sbi->s_mount_opt, ERRORS_RO); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); set_opt(sbi->s_mount_opt, RESERVATION); @@ -1161,7 +1167,6 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, mark_buffer_dirty(EXT2_SB(sb)->s_sbh); if (wait) sync_dirty_buffer(EXT2_SB(sb)->s_sbh); - sb->s_dirt = 0; } /* @@ -1194,8 +1199,6 @@ void ext2_write_super(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) ext2_sync_fs(sb, 1); - else - sb->s_dirt = 0; } static int ext2_remount (struct super_block * sb, int * flags, char * data) @@ -1441,7 +1444,6 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - 
mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -1471,16 +1473,13 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); + if (len == towrite) return err; - } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 6dcafc7..b6754db 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -339,7 +339,6 @@ static void ext2_xattr_update_super_block(struct super_block *sb) spin_lock(&EXT2_SB(sb)->s_lock); EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); spin_unlock(&EXT2_SB(sb)->s_lock); - sb->s_dirt = 1; mark_buffer_dirty(EXT2_SB(sb)->s_sbh); } diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index baac1b1..25cd608 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1439,8 +1439,9 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation) free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - !use_reservation && sbi->s_resuid != current_fsuid() && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) && + (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) || + !in_group_p (sbi->s_resgid))) { return 0; } return 1; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index cc761ad8..92490e9 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -21,30 +21,15 @@ * */ +#include <linux/compat.h> #include "ext3.h" static unsigned char ext3_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int ext3_readdir(struct file *, void *, filldir_t); static int ext3_dx_readdir(struct file * filp, void * dirent, filldir_t filldir); -static int ext3_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext3_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext3_readdir, /* we take BKL. needed?*/ - .unlocked_ioctl = ext3_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext3_compat_ioctl, -#endif - .fsync = ext3_sync_file, /* BKL held */ - .release = ext3_release_dir, -}; - static unsigned char get_dtype(struct super_block *sb, int filetype) { @@ -55,6 +40,25 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing).
+ * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_INDEX) && + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) + return 1; + + return 0; +} int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -94,18 +98,13 @@ static int ext3_readdir(struct file * filp, unsigned long offset; int i, stored; struct ext3_dir_entry_2 *de; - struct super_block *sb; int err; struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; int ret = 0; int dir_has_error = 0; - sb = inode->i_sb; - - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, - EXT3_FEATURE_COMPAT_DIR_INDEX) && - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + if (is_dx_dir(inode)) { err = ext3_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { ret = err; @@ -227,22 +226,87 @@ out: return ret; } +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif +} + /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext3_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT3_HTREE_EOF_32BIT; + else + return EXT3_HTREE_EOF_64BIT; +} + + +/* + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both + * non-htree and htree directories, where the "offset" is in terms + * of the filename hash value instead of the byte offset. 
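The 64-bit encoding above packs the major hash (minus its low bit, which ext3fs_dirhash() keeps clear) into the upper word and the minor hash into the lower word. A standalone round-trip check of that arithmetic, with local copies of the three helpers specialized to the FMODE_64BITHASH branch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* local copies of the 64-bit branch of hash2pos()/pos2maj_hash()/pos2min_hash() */
static uint64_t hash2pos64(uint32_t major, uint32_t minor)
{
	return ((uint64_t)(major >> 1) << 32) | minor;
}

static uint32_t pos2maj64(uint64_t pos) { return ((pos >> 32) << 1) & 0xffffffff; }
static uint32_t pos2min64(uint64_t pos) { return pos & 0xffffffff; }

int main(void)
{
	uint32_t major = 0x9abcdef0;	/* dirhash output: low bit already clear */
	uint32_t minor = 0x12345678;
	uint64_t pos = hash2pos64(major, minor);

	/* shifting right then left by one loses nothing because the
	 * low bit of the major hash is guaranteed zero */
	assert(pos2maj64(pos) == major && pos2min64(pos) == minor);
	printf("f_pos %#llx <-> major %#x minor %#x\n",
	       (unsigned long long)pos, major, minor);
	return 0;
}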
+ * + * Because we may return a 64-bit hash that is well beyond s_maxbytes, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) + * will be invalid once the directory was converted into a dx directory + */ +loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, origin, + ext3_get_htree_eof(file)); + else + return generic_file_llseek(file, offset, origin); +} /* * This structure holds the nodes of the red-black tree used to store @@ -303,15 +367,16 @@ static void free_rb_tree_fname(struct rb_root *root) } -static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -401,7 +466,7 @@ static int call_filldir(struct file * filp, void * dirent, printk("call_filldir: called with null fname?!?\n"); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, fname->name_len, curr_pos, @@ -426,13 +491,13 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = ext3_htree_create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp, filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; } - if (filp->f_pos == EXT3_HTREE_EOF) + if (filp->f_pos == ext3_get_htree_eof(filp)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ @@ -440,8 +505,8 @@ static int ext3_dx_readdir(struct file * filp, free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(filp, filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); } /* @@ -473,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp, if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_node = rb_first(&info->root); @@ -493,7 +558,7 @@ static int ext3_dx_readdir(struct file * filp, info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { - filp->f_pos = EXT3_HTREE_EOF; + filp->f_pos = ext3_get_htree_eof(filp); break; } info->curr_hash = info->next_hash; @@ -512,3 +577,15 @@ static int ext3_release_dir (struct inode * inode, struct file * filp) return 0; } + +const struct file_operations ext3_dir_operations = { + .llseek = ext3_dir_llseek, + .read = generic_read_dir, + .readdir = ext3_readdir, + .unlocked_ioctl = ext3_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext3_compat_ioctl, +#endif + .fsync = ext3_sync_file, + .release = ext3_release_dir, +}; diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h index b6515fd..e85ff15 100644 --- a/fs/ext3/ext3.h +++ b/fs/ext3/ext3.h @@ -243,8 +243,8 @@ struct ext3_new_group_data { */ struct ext3_mount_options { unsigned long s_mount_opt; - uid_t s_resuid; - gid_t s_resgid; + kuid_t 
s_resuid; + kgid_t s_resgid; unsigned long s_commit_interval; #ifdef CONFIG_QUOTA int s_jquota_fmt; @@ -637,8 +637,8 @@ struct ext3_sb_info { struct buffer_head ** s_group_desc; unsigned long s_mount_opt; ext3_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; @@ -920,7 +920,11 @@ struct dx_hash_info u32 *seed; }; -#define EXT3_HTREE_EOF 0x7fffffff + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + /* * Control parameters used by ext3_htree_next_block diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index d10231d..ede315c 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -198,8 +198,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT3_HTREE_EOF << 1)) - hash = (EXT3_HTREE_EOF-1) << 1; + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index e3c39e4..082afd7 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -180,8 +180,7 @@ error_return: * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). + * it has too few free blocks left (min_blocks). * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more @@ -191,21 +190,16 @@ error_return: * when we allocate an inode, within 0--255. 
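With the debt bookkeeping gone, the find_group_orlov() acceptance test reduces to three per-group thresholds. A stand-alone model of that test, with the struct and field names invented for the sketch:

#include <stdint.h>
#include <stdio.h>

/* Per-group usage counters, modelled loosely on ext3_group_desc. */
struct group_stats {
	unsigned free_inodes;
	unsigned free_blocks;
	unsigned used_dirs;
};

/* Mirror of the post-patch acceptance test: a group takes a new
 * directory if it is not too dir-heavy and keeps a cushion of free
 * inodes and free blocks. */
static int group_acceptable(const struct group_stats *g,
			    unsigned max_dirs, unsigned min_inodes,
			    unsigned min_blocks)
{
	if (g->used_dirs >= max_dirs)
		return 0;
	if (g->free_inodes < min_inodes)
		return 0;
	if (g->free_blocks < min_blocks)
		return 0;
	return 1;
}

int main(void)
{
	struct group_stats g = { .free_inodes = 500, .free_blocks = 4000,
				 .used_dirs = 10 };
	printf("acceptable: %d\n", group_acceptable(&g, 64, 256, 1024));
	return 0;
}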
*/ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent) { int parent_group = EXT3_I(parent)->i_block_group; struct ext3_sb_info *sbi = EXT3_SB(sb); - struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext3_fsblk_t freeb, avefreeb; - ext3_fsblk_t blocks_per_dir; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; + int max_dirs, min_inodes; ext3_grpblk_t min_blocks; int group = -1, i; struct ext3_group_desc *desc; @@ -242,20 +236,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; - max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; - for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; desc = ext3_get_group_desc (sb, group, NULL); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 10d7812..9a4a5c4 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -272,18 +272,18 @@ void ext3_evict_inode (struct inode *inode) if (ext3_mark_inode_dirty(handle, inode)) { /* If that failed, just dquot_drop() and be done with that */ dquot_drop(inode); - end_writeback(inode); + clear_inode(inode); } else { ext3_xattr_delete_inode(handle, inode); dquot_free_inode(inode); dquot_drop(inode); - end_writeback(inode); + clear_inode(inode); ext3_free_inode(handle, inode); } ext3_journal_stop(handle); return; no_delete: - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } @@ -2891,6 +2891,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -2907,12 +2909,14 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if(!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); @@ -3068,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle, struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + uid_t i_uid; + gid_t i_gid; again: /* we can't allow multiple procs in here at once, its a bit racey */ @@ -3080,27 +3086,29 @@ again: ext3_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if(!(test_opt(inode->i_sb, 
NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if(!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); + cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -3262,8 +3270,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index d7940b2..eeb63df 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1045,7 +1045,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext3_get_parent(struct dentry *child) { unsigned long ino; - struct qstr dotdot = {.name = "..", .len = 2}; + struct qstr dotdot = QSTR_INIT("..", 2); struct ext3_dir_entry_2 * de; struct buffer_head *bh; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index cf0b592..8c3a44b 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -617,13 +617,15 @@ static int ext3_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpid"); if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS)) seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT3_DEF_RESUID || + if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); + seq_printf(seq, ",resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); } - if (sbi->s_resgid != EXT3_DEF_RESGID || + if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); + seq_printf(seq, ",resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } if (test_opt(sb, ERRORS_RO)) { int def_errors = le16_to_cpu(es->s_errors); @@ -967,6 +969,8 @@ static int parse_options (char *options, struct super_block *sb, substring_t args[MAX_OPT_ARGS]; int data_opt = 0; int option; + kuid_t uid; + kgid_t gid; #ifdef CONFIG_QUOTA int qfmt; #endif @@ -1000,12 +1004,23 @@ static int parse_options (char *options, struct super_block *sb, case Opt_resuid: if (match_int(&args[0], &option)) return 0; - sbi->s_resuid = option; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) { + ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option); + return -1; + + } + sbi->s_resuid = uid; break; case Opt_resgid: if (match_int(&args[0], &option)) return 0; - sbi->s_resgid = option; + gid = 
make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) { + ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option); + return -1; + } + sbi->s_resgid = gid; break; case Opt_sb: /* handled by get_sb_block() instead of here */ @@ -1651,8 +1666,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; + sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID); sbi->s_sb_block = sb_block; blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); @@ -1716,8 +1731,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) else set_opt(sbi->s_mount_opt, ERRORS_RO); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); /* enable barriers by default */ set_opt(sbi->s_mount_opt, BARRIER); @@ -3000,7 +3015,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext3_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -3024,10 +3038,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, } brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; @@ -3035,7 +3047,6 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - mutex_unlock(&inode->i_mutex); return len; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4bbd07a..c45c411 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -461,8 +461,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, return 1; /* Hm, nope. Are (enough) root reserved clusters available? 
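All of the resuid/resgid handling above funnels through make_kuid()/make_kgid(). A toy single-extent model of that namespace mapping, assuming only the convention that an all-ones value marks an invalid id; the types and helpers are re-declared locally for illustration and are not the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t val; } kuid_t;	/* opaque, like the kernel type */

/* One uid_map extent: ns uids [first, first+count) map onto kernel
 * uids starting at lower_first. Single-extent model only. */
struct uid_extent { uint32_t first, lower_first, count; };

#define INVALID_KUID ((kuid_t){ (uint32_t)-1 })

static kuid_t make_kuid(const struct uid_extent *ns, uint32_t uid)
{
	if (uid >= ns->first && uid - ns->first < ns->count)
		return (kuid_t){ ns->lower_first + (uid - ns->first) };
	return INVALID_KUID;	/* out of range: reject, as parse_options does */
}

static int uid_valid(kuid_t k) { return k.val != (uint32_t)-1; }
static int uid_eq(kuid_t a, kuid_t b) { return a.val == b.val; }

int main(void)
{
	struct uid_extent container = { .first = 0, .lower_first = 100000,
					.count = 65536 };
	kuid_t k = make_kuid(&container, 1000);	/* kernel-side uid 101000 */

	printf("valid=%d eq_root=%d\n", uid_valid(k),
	       uid_eq(k, make_kuid(&container, 0)));
	return 0;
}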
*/ - if (sbi->s_resuid == current_fsuid() || - ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0e01e90..c21b1de5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1153,8 +1153,8 @@ struct ext4_sb_info { unsigned int s_mount_flags; unsigned int s_def_mount_opt; ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 409c2ee..9f9acac 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -808,8 +808,8 @@ got: } if (owner) { inode->i_mode = mode; - inode->i_uid = owner[0]; - inode->i_gid = owner[1]; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; inode->i_uid = current_fsuid(); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c77b0bd..07eaf56 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3630,6 +3630,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -3645,12 +3647,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (!(test_opt(inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -3870,6 +3874,8 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; + uid_t i_uid; + gid_t i_gid; /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ @@ -3878,27 +3884,27 @@ static int ext4_do_update_inode(handle_t *handle, ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. 
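The i_uid_low/i_uid_high fields kept by ext3 and ext4 are a plain 16/16 split of a 32-bit id, so kernels that predate 32-bit uids still see a sane low half. A compile-and-run sketch of the round trip; the struct here is a stand-in, not the real on-disk layout:

#include <assert.h>
#include <stdint.h>

static uint16_t low_16_bits(uint32_t v)  { return v & 0xffff; }
static uint16_t high_16_bits(uint32_t v) { return (v >> 16) & 0xffff; }

struct raw_inode_ids {		/* stand-in for the raw inode uid fields */
	uint16_t i_uid_low;
	uint16_t i_uid_high;
};

int main(void)
{
	uint32_t uid = 70000;	/* does not fit in 16 bits */
	struct raw_inode_ids raw = {
		.i_uid_low  = low_16_bits(uid),
		.i_uid_high = high_16_bits(uid),
	};

	/* Reassembly, as ext3_iget()/ext4_iget() do after the patch. */
	uint32_t back = raw.i_uid_low | ((uint32_t)raw.i_uid_high << 16);
	assert(back == uid);
	return 0;
}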
Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ if (!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -4084,8 +4090,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (is_quota_modification(inode, attr)) dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f39f80f..f1bb32e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -466,8 +466,8 @@ int ext4_ext_migrate(struct inode *inode) } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; - owner[0] = inode->i_uid; - owner[1] = inode->i_gid; + owner[0] = i_uid_read(inode); + owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 349d7b3..e2a3f4b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1052,10 +1052,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - static const struct qstr dotdot = { - .name = "..", - .len = 2, - }; + static const struct qstr dotdot = QSTR_INIT("..", 2); struct ext4_dir_entry_2 * de; struct buffer_head *bh; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e1fb1d5..35b5954 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1007,7 +1007,7 @@ static void destroy_inodecache(void) void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_I(inode)->jinode) { @@ -1448,6 +1448,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, { struct ext4_sb_info *sbi = EXT4_SB(sb); const struct mount_opts *m; + kuid_t uid; + kgid_t gid; int arg = 0; #ifdef CONFIG_QUOTA @@ -1474,10 +1476,20 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "Ignoring removed %s option", opt); return 1; case Opt_resuid: - sbi->s_resuid = arg; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); + return -1; + } + sbi->s_resuid = uid; return 1; case Opt_resgid: - sbi->s_resgid = arg; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); + return -1; + } + sbi->s_resgid = gid; return 1; case Opt_abort: sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; @@ -1732,12 +1744,14 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("%s", 
token2str(m->token)); } - if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || + if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) - SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); - if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || + SEQ_OPTS_PRINT("resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) - SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); + SEQ_OPTS_PRINT("resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); @@ -2980,8 +2994,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; - sbi->s_resuid = EXT4_DEF_RESUID; - sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) @@ -3060,8 +3074,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; @@ -4213,8 +4227,8 @@ static int ext4_unfreeze(struct super_block *sb) struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; - uid_t s_resuid; - gid_t s_resgid; + kuid_t s_resuid; + kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA @@ -4744,7 +4758,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; @@ -4760,16 +4773,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) { - mutex_unlock(&inode->i_mutex); + if (err) return err; - } if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; ext4_mark_inode_dirty(handle, inode); } - mutex_unlock(&inode->i_mutex); return len; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 21687e3..b3d290c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -454,7 +454,7 @@ static void fat_evict_inode(struct inode *inode) fat_truncate_blocks(inode, 0); } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); fat_cache_inval_inode(inode); fat_detach(inode); } @@ -532,9 +532,9 @@ static inline int sigio_perm(struct task_struct *p, rcu_read_lock(); cred = __task_cred(p); - ret = ((fown->euid == 0 || - fown->euid == cred->suid || fown->euid == cred->uid || - fown->uid == cred->suid || fown->uid == cred->uid) && + ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || + uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || + uid_eq(fown->uid, cred->suid) || 
uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); rcu_read_unlock(); return ret; } diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index cf9ef91..ef67c95 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -355,6 +355,6 @@ void vxfs_evict_inode(struct inode *ip) { truncate_inode_pages(&ip->i_data, 0); - end_writeback(ip); + clear_inode(ip); call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 539f36c..8d2fb8c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -231,11 +231,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { - /* - * Prevent speculative execution through - * spin_unlock(&wb->list_lock); - */ - + inode->i_state &= ~I_SYNC; + /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -329,10 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Wait for writeback on an inode to complete. + * Wait for writeback on an inode to complete. Called with i_lock held. + * Caller must make sure inode cannot go away when we drop i_lock. */ -static void inode_wait_for_writeback(struct inode *inode, - struct bdi_writeback *wb) +static void __inode_wait_for_writeback(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -340,70 +339,119 @@ static void inode_wait_for_writeback(struct inode *inode, wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under wb->list_lock and - * inode->i_lock. Either the caller has an active reference on the inode or - * the inode has I_WILL_FREE set. - * - * If `wait' is set, wait on the writeout. - * - * The whole writeout design is quite complex and fragile. We want to avoid - * starvation of particular inodes when others are being redirtied, prevent - * livelocks, etc. + * Wait for writeback on an inode to complete. Caller must have inode pinned. */ -static int -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +void inode_wait_for_writeback(struct inode *inode) { - struct address_space *mapping = inode->i_mapping; - long nr_to_write = wbc->nr_to_write; - unsigned dirty; - int ret; + spin_lock(&inode->i_lock); + __inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); +} - assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); +/* + * Sleep until I_SYNC is cleared. This function must be called with i_lock + * held and drops it. It is intended for callers that hold no inode + * reference, so once i_lock is dropped, the inode can go away. 
+ */ +static void inode_sleep_on_writeback(struct inode *inode) + __releases(inode->i_lock) +{ + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); + int sleep; - if (!atomic_read(&inode->i_count)) - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); - else - WARN_ON(inode->i_state & I_WILL_FREE); + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + sleep = inode->i_state & I_SYNC; + spin_unlock(&inode->i_lock); + if (sleep) + schedule(); + finish_wait(wqh, &wait); +} - if (inode->i_state & I_SYNC) { +/* + * Find proper writeback list for the inode depending on its current state and + * possibly also change of its state while we were doing writeback. Here we + * handle things such as livelock prevention or fairness of writeback among + * inodes. This function can be called only by the flusher thread - no one else + * processes all inodes in writeback lists and requeueing inodes behind the + * flusher thread's back can have unexpected consequences. + */ +static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + if (inode->i_state & I_FREEING) + return; + + /* + * Sync livelock prevention. Each inode is tagged and synced in one + * shot. If still dirty, it will be redirty_tail()'ed below. Update + * the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + + if (wbc->pages_skipped) { /* - * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to b_more_io so that - * writeback can proceed with the other inodes on s_io. - * - * We'll have another go at writing back this inode when we - * completed a full scan of b_io. + * writeback is not making progress due to locked + * buffers. Skip this inode for now. */ - if (wbc->sync_mode != WB_SYNC_ALL) { + redirty_tail(inode, wb); + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { + /* + * We didn't write back all the pages. nfs_writepages() + * sometimes bails out without doing anything. + */ + if (wbc->nr_to_write <= 0) { + /* Slice used up. Queue for next turn. */ requeue_io(inode, wb); - trace_writeback_single_inode_requeue(inode, wbc, - nr_to_write); - return 0; + } else { + /* + * Writeback blocked by something other than + * congestion. Delay the inode for some time to + * avoid spinning on the CPU (100% iowait) + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. + */ + redirty_tail(inode, wb); } - + } else if (inode->i_state & I_DIRTY) { /* - * It's a data-integrity sync. We must wait. + * Filesystems can dirty the inode during writeback operations, + * such as delayed allocation during submission or metadata + * updates after data IO completion. */ - inode_wait_for_writeback(inode, wb); + redirty_tail(inode, wb); + } else { + /* The inode is clean. Remove from writeback lists. */ + list_del_init(&inode->i_wb_list); } +} - BUG_ON(inode->i_state & I_SYNC); +/* + * Write out an inode and its dirty pages. Do not update the writeback list + * linkage. That is left to the caller. The caller is also responsible for + * setting I_SYNC flag and calling inode_sync_complete() to clear it. 
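The requeue_inode() policy above boils down to a small decision table. A stand-alone model as a pure function, with invented flag bits and verdict names standing in for the kernel's state and list operations:

#include <stdio.h>

enum { I_DIRTY = 1, I_FREEING = 2 };	/* invented values for the sketch */

enum verdict { V_NOTHING, V_REDIRTY_TAIL, V_REQUEUE_IO, V_REMOVE };

static enum verdict requeue_verdict(int i_state, int pages_skipped,
				    int mapping_dirty, long nr_to_write_left)
{
	if (i_state & I_FREEING)
		return V_NOTHING;
	if (pages_skipped)
		return V_REDIRTY_TAIL;	/* blocked on locked buffers */
	if (mapping_dirty)		/* dirty pages remain */
		return nr_to_write_left <= 0 ? V_REQUEUE_IO /* slice used up */
					     : V_REDIRTY_TAIL;
	if (i_state & I_DIRTY)
		return V_REDIRTY_TAIL;	/* redirtied during writeback */
	return V_REMOVE;		/* clean: drop from writeback lists */
}

int main(void)
{
	/* Dirty pages left and the slice is exhausted: queue for next turn. */
	printf("%d\n", requeue_verdict(I_DIRTY, 0, 1, 0));
	return 0;
}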
+ */ +static int +__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; + unsigned dirty; + int ret; - /* Set I_SYNC, reset I_DIRTY_PAGES */ - inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY_PAGES; - spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); + WARN_ON(!(inode->i_state & I_SYNC)); ret = do_writepages(mapping, wbc); @@ -424,6 +472,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * write_inode() */ spin_lock(&inode->i_lock); + /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ + if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + inode->i_state &= ~I_DIRTY_PAGES; dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); @@ -433,60 +484,67 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, if (ret == 0) ret = err; } + trace_writeback_single_inode(inode, wbc, nr_to_write); + return ret; +} + +/* + * Write out an inode's dirty pages. Either the caller has an active reference + * on the inode or the inode has I_WILL_FREE set. + * + * This function is designed to be called for writing back one inode at a + * time, e.g. from within a filesystem. The flusher thread instead uses + * __writeback_single_inode() and does more thorough writeback list handling + * in writeback_sb_inodes(). + */ +static int +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; - spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - inode->i_state &= ~I_SYNC; - if (!(inode->i_state & I_FREEING)) { + if (!atomic_read(&inode->i_count)) + WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + else + WARN_ON(inode->i_state & I_WILL_FREE); + + if (inode->i_state & I_SYNC) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out; /* - * Sync livelock prevention. Each inode is tagged and synced in - * one shot. If still dirty, it will be redirty_tail()'ed below. - * Update the dirty time to prevent enqueue and sync it again. + * It's a data-integrity sync. We must wait. Since callers hold + * inode reference or inode has I_WILL_FREE set, it cannot go + * away under us. */ - if ((inode->i_state & I_DIRTY) && - (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) - inode->dirtied_when = jiffies; - - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - /* - * We didn't write back all the pages. nfs_writepages() - * sometimes bales out without doing anything. - */ - inode->i_state |= I_DIRTY_PAGES; - if (wbc->nr_to_write <= 0) { - /* - * slice used up: queue for next turn - */ - requeue_io(inode, wb); - } else { - /* - * Writeback blocked by something other than - * congestion. Delay the inode for some time to - * avoid spinning on the CPU (100% iowait) - * retrying writeback of the dirty page/inode - * that cannot be performed immediately. - */ - redirty_tail(inode, wb); - } - } else if (inode->i_state & I_DIRTY) { - /* - * Filesystems can dirty the inode during writeback - * operations, such as delayed allocation during - * submission or metadata updates after data IO - * completion. - */ - redirty_tail(inode, wb); - } else { - /* - * The inode is clean. At this point we either have - * a reference to the inode or it's on it's way out. - * No need to add it back to the LRU. 
- */ - list_del_init(&inode->i_wb_list); - } + __inode_wait_for_writeback(inode); } + WARN_ON(inode->i_state & I_SYNC); + /* + * Skip inode if it is clean. We don't want to mess with writeback + * lists in this function since flusher thread may be doing for example + * sync in parallel and if we move the inode, it could get skipped. So + * here we make sure inode is on some writeback list and leave it there + * unless we have completely cleaned the inode. + */ + if (!(inode->i_state & I_DIRTY)) + goto out; + inode->i_state |= I_SYNC; + spin_unlock(&inode->i_lock); + + ret = __writeback_single_inode(inode, wb, wbc); + + spin_lock(&wb->list_lock); + spin_lock(&inode->i_lock); + /* + * If inode is clean, remove it from writeback lists. Otherwise don't + * touch it. See comment above for explanation. + */ + if (!(inode->i_state & I_DIRTY)) + list_del_init(&inode->i_wb_list); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); - trace_writeback_single_inode(inode, wbc, nr_to_write); +out: + spin_unlock(&inode->i_lock); return ret; } @@ -580,29 +638,57 @@ static long writeback_sb_inodes(struct super_block *sb, redirty_tail(inode, wb); continue; } - __iget(inode); + if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + /* + * If this inode is locked for writeback and we are not + * doing writeback-for-data-integrity, move it to + * b_more_io so that writeback can proceed with the + * other inodes on s_io. + * + * We'll have another go at writing back this inode + * when we completed a full scan of b_io. + */ + spin_unlock(&inode->i_lock); + requeue_io(inode, wb); + trace_writeback_sb_inodes_requeue(inode); + continue; + } + spin_unlock(&wb->list_lock); + + /* + * We already requeued the inode if it had I_SYNC set and we + * are doing WB_SYNC_NONE writeback. So this catches only the + * WB_SYNC_ALL case. + */ + if (inode->i_state & I_SYNC) { + /* Wait for I_SYNC. This function drops i_lock... */ + inode_sleep_on_writeback(inode); + /* Inode may be gone, start again */ + continue; + } + inode->i_state |= I_SYNC; + spin_unlock(&inode->i_lock); + write_chunk = writeback_chunk_size(wb->bdi, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; - writeback_single_inode(inode, wb, &wbc); + /* + * We use I_SYNC to pin the inode in memory. While it is set + * evict_inode() will wait so the inode cannot be freed. + */ + __writeback_single_inode(inode, wb, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; + spin_lock(&wb->list_lock); + spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY)) wrote++; - if (wbc.pages_skipped) { - /* - * writeback is not making progress due to locked - * buffers. Skip this inode for now. - */ - redirty_tail(inode, wb); - } + requeue_inode(inode, wb, &wbc); + inode_sync_complete(inode); spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - iput(inode); - cond_resched(); - spin_lock(&wb->list_lock); + cond_resched_lock(&wb->list_lock); /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -796,8 +882,10 @@ static long wb_writeback(struct bdi_writeback *wb, trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode, wb); - spin_unlock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... 
*/ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } } spin_unlock(&wb->list_lock); @@ -1331,7 +1419,6 @@ EXPORT_SYMBOL(sync_inodes_sb); int write_inode_now(struct inode *inode, int sync) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, @@ -1343,12 +1430,7 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&wb->list_lock); - spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wb, &wbc); - spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - return ret; + return writeback_single_inode(inode, wb, &wbc); } EXPORT_SYMBOL(write_inode_now); @@ -1365,15 +1447,7 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - int ret; - - spin_lock(&wb->list_lock); - spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wb, wbc); - spin_unlock(&inode->i_lock); - spin_unlock(&wb->list_lock); - return ret; + return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 26783eb..56f6dcf 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -122,7 +122,7 @@ static void fuse_destroy_inode(struct inode *inode) static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 230eb0f..bd4a589 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -73,12 +73,8 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) int error = 0; if (mode != inode->i_mode) { - struct iattr iattr; - - iattr.ia_valid = ATTR_MODE; - iattr.ia_mode = mode; - - error = gfs2_setattr_simple(inode, &iattr); + inode->i_mode = mode; + mark_inode_dirty(inode); } return error; @@ -126,9 +122,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) return PTR_ERR(acl); if (!acl) { mode &= ~current_umask(); - if (mode != inode->i_mode) - error = gfs2_set_mode(inode, mode); - return error; + return gfs2_set_mode(inode, mode); } if (S_ISDIR(inode->i_mode)) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9b2ff0e..e80a464 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -36,8 +36,8 @@ #include "glops.h" -void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to) +static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) { struct buffer_head *head = page_buffers(page); unsigned int bsize = head->b_size; @@ -517,15 +517,14 @@ out: /** * gfs2_internal_read - read an internal file * @ip: The gfs2 inode - * @ra_state: The readahead state (or NULL for no readahead) * @buf: The buffer to fill * @pos: The file position * @size: The amount to read * */ -int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size) +int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + unsigned size) { struct address_space *mapping = ip->i_inode.i_mapping; unsigned long index = *pos / PAGE_CACHE_SIZE; @@ -943,8 +942,8 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) clear_buffer_dirty(bh); 
bd = bh->b_private; if (bd) { - if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); + if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_list); else gfs2_remove_from_journal(bh, current->journal_info, 0); } @@ -1084,10 +1083,9 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); - if (!list_empty(&bd->bd_le.le_list)) { + if (!list_empty(&bd->bd_list)) { if (!buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); + list_del_init(&bd->bd_list); else bd = NULL; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03c04fe..dab5409 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -324,7 +324,7 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) if (!dblock) return x + 1; - ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]); + ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]); if (ret) return ret; } @@ -882,7 +882,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; } else { - error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + error = gfs2_meta_indirect_buffer(ip, height, block, &bh); if (error) return error; @@ -1169,6 +1169,7 @@ static int do_grow(struct inode *inode, u64 size) struct buffer_head *dibh; struct gfs2_qadata *qa = NULL; int error; + int unstuff = 0; if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { @@ -1183,13 +1184,14 @@ static int do_grow(struct inode *inode, u64 size) error = gfs2_inplace_reserve(ip, 1); if (error) goto do_grow_qunlock; + unstuff = 1; } error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); if (error) goto do_grow_release; - if (qa) { + if (unstuff) { error = gfs2_unstuff_dinode(ip, NULL); if (error) goto do_end_trans; @@ -1208,7 +1210,7 @@ static int do_grow(struct inode *inode, u64 size) do_end_trans: gfs2_trans_end(sdp); do_grow_release: - if (qa) { + if (unstuff) { gfs2_inplace_release(ip); do_grow_qunlock: gfs2_quota_unlock(ip); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index a836056..8aaeb07 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -821,7 +821,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; - struct qstr name = { .name = "", .len = 0, .hash = 0 }; + struct qstr name = { .name = "" }; error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a3d2c9e..31b199f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -558,14 +558,14 @@ fail: } /** - * gfs2_close - called to close a struct file + * gfs2_release - called to close a struct file * @inode: the inode the struct file belongs to * @file: the struct file being closed * * Returns: errno */ -static int gfs2_close(struct inode *inode, struct file *file) +static int gfs2_release(struct inode *inode, struct file *file) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; struct gfs2_file *fp; @@ -1005,7 +1005,7 @@ const struct file_operations gfs2_file_fops = { .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, - .release = gfs2_close, + .release = gfs2_release, .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, 
@@ -1019,7 +1019,7 @@ const struct file_operations gfs2_dir_fops = { .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, - .release = gfs2_close, + .release = gfs2_release, .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, @@ -1037,7 +1037,7 @@ const struct file_operations gfs2_file_fops_nolock = { .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, - .release = gfs2_close, + .release = gfs2_release, .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, @@ -1049,7 +1049,7 @@ const struct file_operations gfs2_dir_fops_nolock = { .readdir = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .open = gfs2_open, - .release = gfs2_close, + .release = gfs2_release, .fsync = gfs2_fsync, .llseek = default_llseek, }; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 1656df7a..4bdcf37 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -94,7 +94,6 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) /* A shortened, inline version of gfs2_trans_begin() */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); tr.tr_ip = (unsigned long)__builtin_return_address(0); - INIT_LIST_HEAD(&tr.tr_list_buf); gfs2_log_reserve(sdp, tr.tr_reserved); BUG_ON(current->journal_info); current->journal_info = &tr; @@ -379,11 +378,6 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) if (error) return error; - if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { - brelse(dibh); - return -EIO; - } - error = gfs2_dinode_in(ip, dibh->b_data); brelse(dibh); clear_bit(GIF_INVALID, &ip->i_flags); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 47d0bda..67fd6be 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -26,7 +26,7 @@ #define DIO_METADATA 0x00000020 struct gfs2_log_operations; -struct gfs2_log_element; +struct gfs2_bufdata; struct gfs2_holder; struct gfs2_glock; struct gfs2_quota_data; @@ -52,7 +52,7 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); + void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); void (*lo_before_commit) (struct gfs2_sbd *sdp); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); void (*lo_before_scan) (struct gfs2_jdesc *jd, @@ -64,11 +64,6 @@ struct gfs2_log_operations { const char *lo_name; }; -struct gfs2_log_element { - struct list_head le_list; - const struct gfs2_log_operations *le_ops; -}; - #define GBF_FULL 1 struct gfs2_bitmap { @@ -118,15 +113,10 @@ TAS_BUFFER_FNS(Zeronew, zeronew) struct gfs2_bufdata { struct buffer_head *bd_bh; struct gfs2_glock *bd_gl; + u64 bd_blkno; - union { - struct list_head list_tr; - u64 blkno; - } u; -#define bd_list_tr u.list_tr -#define bd_blkno u.blkno - - struct gfs2_log_element bd_le; + struct list_head bd_list; + const struct gfs2_log_operations *bd_ops; struct gfs2_ail *bd_ail; struct list_head bd_ail_st_list; @@ -411,13 +401,10 @@ struct gfs2_trans { int tr_touched; - unsigned int tr_num_buf; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; - struct list_head tr_list_buf; - unsigned int tr_num_revoke; unsigned int tr_num_revoke_rm; }; @@ -556,7 +543,6 @@ struct gfs2_sb_host { struct lm_lockstruct { int ls_jid; unsigned int ls_first; - unsigned int ls_nodir; const struct lm_lockops *ls_ops; dlm_lockspace_t *ls_dlm; @@ -699,7 +685,6 @@ struct gfs2_sbd { struct list_head sd_log_le_buf; struct list_head sd_log_le_revoke; - struct 
list_head sd_log_le_rg; struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; @@ -716,7 +701,9 @@ struct gfs2_sbd { struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; + struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; unsigned int sd_log_flush_head; u64 sd_log_flush_wrapped; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 276e7b5..c53c747 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -17,10 +17,7 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); extern int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, char *buf, loff_t *pos, unsigned size); -extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to); extern void gfs2_set_aops(struct inode *inode); static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 5f5e70e..4a38db7 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1209,8 +1209,6 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) fsname++; flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL; - if (ls->ls_nodir) - flags |= DLM_LSFL_NODIR; /* * create/join lockspace diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 4752ead..f4beeb9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -32,8 +32,6 @@ #include "dir.h" #include "trace_gfs2.h" -#define PULL 1 - /** * gfs2_struct2blk - compute stuff * @sdp: the filesystem @@ -359,18 +357,6 @@ retry: return 0; } -u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) -{ - struct gfs2_journal_extent *je; - - list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { - if (lbn >= je->lblock && lbn < je->lblock + je->blocks) - return je->dblock + lbn - je->lblock; - } - - return -1; -} - /** * log_distance - Compute distance between two journal blocks * @sdp: The GFS2 superblock @@ -466,17 +452,6 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) return tail; } -void gfs2_log_incr_head(struct gfs2_sbd *sdp) -{ - BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && - (sdp->sd_log_flush_head != sdp->sd_log_head)); - - if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { - sdp->sd_log_flush_head = 0; - sdp->sd_log_flush_wrapped = 1; - } -} - static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) { unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); @@ -511,8 +486,8 @@ static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) { struct gfs2_bufdata *bda, *bdb; - bda = list_entry(a, struct gfs2_bufdata, bd_le.le_list); - bdb = list_entry(b, struct gfs2_bufdata, bd_le.le_list); + bda = list_entry(a, struct gfs2_bufdata, bd_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_list); if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) return -1; @@ -530,8 +505,8 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) gfs2_log_lock(sdp); list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list); - list_move(&bd->bd_le.le_list, &written); + bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); + list_move(&bd->bd_list, &written); bh = bd->bd_bh; if (!buffer_dirty(bh)) continue; @@ -558,7 +533,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) gfs2_log_lock(sdp); while (!list_empty(&sdp->sd_log_le_ordered)) { - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, 
bd_le.le_list); + bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); bh = bd->bd_bh; if (buffer_locked(bh)) { get_bh(bh); @@ -568,7 +543,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) gfs2_log_lock(sdp); continue; } - list_del_init(&bd->bd_le.le_list); + list_del_init(&bd->bd_list); } gfs2_log_unlock(sdp); } @@ -580,25 +555,19 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) * Returns: the initialized log buffer descriptor */ -static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { - u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; struct gfs2_log_header *lh; unsigned int tail; u32 hash; - - bh = sb_getblk(sdp->sd_vfs, blkno); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); + int rw = WRITE_FLUSH_FUA | REQ_META; + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + lh = page_address(page); + clear_page(lh); gfs2_ail1_empty(sdp); tail = current_tail(sdp); - lh = (struct gfs2_log_header *)bh->b_data; - memset(lh, 0, sizeof(struct gfs2_log_header)); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); lh->lh_header.__pad0 = cpu_to_be64(0); @@ -608,31 +577,22 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) lh->lh_flags = cpu_to_be32(flags); lh->lh_tail = cpu_to_be32(tail); lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); - hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + hash = gfs2_disk_hash(page_address(page), sizeof(struct gfs2_log_header)); lh->lh_hash = cpu_to_be32(hash); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); - } else { - submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + rw = WRITE_SYNC | REQ_META | REQ_PRIO; } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - gfs2_io_error_bh(sdp, bh); - brelse(bh); + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + gfs2_log_write_page(sdp, page); + gfs2_log_flush_bio(sdp, rw); + log_flush_wait(sdp); if (sdp->sd_log_tail != tail) log_pull_tail(sdp, tail); - else - gfs2_assert_withdraw(sdp, !pull); - - sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); - gfs2_log_incr_head(sdp); } /** @@ -678,15 +638,14 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) gfs2_ordered_write(sdp); lops_before_commit(sdp); + gfs2_log_flush_bio(sdp, WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { - log_write_header(sdp, 0, 0); + log_write_header(sdp, 0); } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ - gfs2_log_lock(sdp); atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); - gfs2_log_unlock(sdp); - log_write_header(sdp, 0, PULL); + log_write_header(sdp, 0); } lops_after_commit(sdp, ai); @@ -735,21 +694,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_unlock(sdp); } -static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) -{ - struct list_head *head = &tr->tr_list_buf; - struct gfs2_bufdata *bd; - - gfs2_log_lock(sdp); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); - list_del_init(&bd->bd_list_tr); - tr->tr_num_buf--; - } - gfs2_log_unlock(sdp); - gfs2_assert_warn(sdp, !tr->tr_num_buf); -} - 
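After this change log_write_header() builds the header in a page from gfs2_page_pool, checksums it while the hash field is still zero, and only then stamps the hash. A toy model of that ordering; xor_hash() is a stand-in for gfs2_disk_hash() (a CRC in the real code), and the struct is heavily simplified:

#include <stdint.h>
#include <stdio.h>

struct log_header {
	uint32_t magic;
	uint32_t sequence;
	uint32_t tail;
	uint32_t hash;		/* must be zero while hashing */
};

/* Toy rotate-and-xor digest; not the real gfs2_disk_hash(). */
static uint32_t xor_hash(const void *buf, unsigned long len)
{
	const unsigned char *p = buf;
	uint32_t h = 0;
	while (len--)
		h = (h << 1 | h >> 31) ^ *p++;
	return h;
}

int main(void)
{
	/* Stands in for the mempool page after clear_page(). */
	struct log_header lh = { 0 };

	lh.magic = 0x01161970;			/* GFS2_MAGIC */
	lh.sequence = 42;
	lh.tail = 7;
	lh.hash = xor_hash(&lh, sizeof(lh));	/* hash field still 0 here */

	printf("header hash %08x\n", (unsigned)lh.hash);
	return 0;	/* the page would now go to gfs2_log_write_page() */
}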
/** * gfs2_log_commit - Commit a transaction to the log * @sdp: the filesystem @@ -768,8 +712,6 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - buf_lo_incore_commit(sdp, tr); - up_read(&sdp->sd_log_flush_lock); if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || @@ -798,8 +740,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; - log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, - (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL); + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); @@ -854,11 +795,9 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; DEFINE_WAIT(wait); - unsigned preflush; while (!kthread_should_stop()) { - preflush = atomic_read(&sdp->sd_log_pinned); if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index ff07454..3fd5215 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -52,8 +52,6 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); -extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6b1efb5..852c1be 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -127,146 +127,277 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, atomic_dec(&sdp->sd_log_pinned); } - -static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh) +static void gfs2_log_incr_head(struct gfs2_sbd *sdp) { - return (struct gfs2_log_descriptor *)bh->b_data; + BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && + (sdp->sd_log_flush_head != sdp->sd_log_head)); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } } -static inline __be64 *bh_log_ptr(struct buffer_head *bh) +static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) { - struct gfs2_log_descriptor *ld = bh_log_desc(bh); - return (__force __be64 *)(ld + 1); + unsigned int lbn = sdp->sd_log_flush_head; + struct gfs2_journal_extent *je; + u64 block; + + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { + if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { + block = je->dblock + lbn - je->lblock; + gfs2_log_incr_head(sdp); + return block; + } + } + + return -1; } -static inline __be64 *bh_ptr_end(struct buffer_head *bh) +/** + * gfs2_end_log_write_bh - end log write of pagecache data with buffers + * @sdp: The superblock + * @bvec: The bio_vec + * @error: The i/o status + * + * This finds the relevant buffers, unlocks them, and sets the + * error flag according to the status of the i/o request. This is + * used when the log is writing data which has an in-place version + * that is pinned in the pagecache. 
+ */ + +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, + int error) { - return (__force __be64 *)(bh->b_data + bh->b_size); + struct buffer_head *bh, *next; + struct page *page = bvec->bv_page; + unsigned size; + + bh = page_buffers(page); + size = bvec->bv_len; + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + if (error) + set_buffer_write_io_error(bh); + unlock_buffer(bh); + next = bh->b_this_page; + size -= bh->b_size; + brelse(bh); + bh = next; + } while(bh && size); } /** - * gfs2_log_write_endio - End of I/O for a log buffer - * @bh: The buffer head - * @uptodate: I/O Status + * gfs2_end_log_write - end of i/o to the log + * @bio: The bio + * @error: Status of i/o request + * + * Each bio_vec contains either data from the pagecache or data + * relating to the log itself. Here we iterate over the bio_vec + * array, processing both kinds of data. * */ -static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) +static void gfs2_end_log_write(struct bio *bio, int error) { - struct gfs2_sbd *sdp = bh->b_private; - bh->b_private = NULL; + struct gfs2_sbd *sdp = bio->bi_private; + struct bio_vec *bvec; + struct page *page; + int i; - end_buffer_write_sync(bh, uptodate); + if (error) { + sdp->sd_log_error = error; + fs_err(sdp, "Error %d writing to log\n", error); + } + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page_has_buffers(page)) + gfs2_end_log_write_bh(sdp, bvec, error); + else + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); if (atomic_dec_and_test(&sdp->sd_log_in_flight)) wake_up(&sdp->sd_log_flush_wait); } /** - * gfs2_log_get_buf - Get and initialize a buffer to use for log control data - * @sdp: The GFS2 superblock + * gfs2_log_flush_bio - Submit any pending log bio + * @sdp: The superblock + * @rw: The rw flags * - * tReturns: the buffer_head + * Submit any pending part-built or full bio to the block device. If + * there is no pending bio, then this is a no-op. */ -static struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) { - u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; + if (sdp->sd_log_bio) { + atomic_inc(&sdp->sd_log_in_flight); + submit_bio(rw, sdp->sd_log_bio); + sdp->sd_log_bio = NULL; + } +} - bh = sb_getblk(sdp->sd_vfs, blkno); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - gfs2_log_incr_head(sdp); - atomic_inc(&sdp->sd_log_in_flight); - bh->b_private = sdp; - bh->b_end_io = gfs2_log_write_endio; +/** + * gfs2_log_alloc_bio - Allocate a new bio for log writing + * @sdp: The superblock + * @blkno: The next device block number we want to write to + * + * This should never be called when there is a cached bio in the + * super block. When it returns, there will be a cached bio in the + * super block which will have as many bio_vecs as the device is + * happy to handle. 
+ * + * Returns: Newly allocated bio + */ - return bh; +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +{ + struct super_block *sb = sdp->sd_vfs; + unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); + struct bio *bio; + + BUG_ON(sdp->sd_log_bio); + + while (1) { + bio = bio_alloc(GFP_NOIO, nrvecs); + if (likely(bio)) + break; + nrvecs = max(nrvecs/2, 1U); + } + + bio->bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio->bi_end_io = gfs2_end_log_write; + bio->bi_private = sdp; + + sdp->sd_log_bio = bio; + + return bio; } /** - * gfs2_fake_write_endio - - * @bh: The buffer head - * @uptodate: The I/O Status + * gfs2_log_get_bio - Get cached log bio, or allocate a new one + * @sdp: The superblock + * @blkno: The device block number we want to write to + * + * If there is a cached bio, then if the next block number is sequential + * with the previous one, return it, otherwise flush the bio to the + * device. If there is not a cached bio, or we just flushed it, then + * allocate a new one. * + * Returns: The bio to use for log writes */ -static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) { - struct buffer_head *real_bh = bh->b_private; - struct gfs2_bufdata *bd = real_bh->b_private; - struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; + struct bio *bio = sdp->sd_log_bio; + u64 nblk; + + if (bio) { + nblk = bio->bi_sector + bio_sectors(bio); + nblk >>= sdp->sd_fsb2bb_shift; + if (blkno == nblk) + return bio; + gfs2_log_flush_bio(sdp, WRITE); + } - end_buffer_write_sync(bh, uptodate); - mempool_free(bh, gfs2_bh_pool); - unlock_buffer(real_bh); - brelse(real_bh); - if (atomic_dec_and_test(&sdp->sd_log_in_flight)) - wake_up(&sdp->sd_log_flush_wait); + return gfs2_log_alloc_bio(sdp, blkno); } + /** - * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * gfs2_log_write - write to log * @sdp: the filesystem - * @data: the data the buffer_head should point to + * @page: the page to write + * @size: the size of the data to write + * @offset: the offset within the page * - * Returns: the log buffer descriptor + * Try and add the page segment to the current bio. If that fails, + * submit the current bio to the device and create a new one, and + * then add the page segment to that. */ -static struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, - struct buffer_head *real) +static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, + unsigned size, unsigned offset) { - u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; + u64 blkno = gfs2_log_bmap(sdp); + struct bio *bio; + int ret; + + bio = gfs2_log_get_bio(sdp, blkno); + ret = bio_add_page(bio, page, size, offset); + if (ret == 0) { + gfs2_log_flush_bio(sdp, WRITE); + bio = gfs2_log_alloc_bio(sdp, blkno); + ret = bio_add_page(bio, page, size, offset); + WARN_ON(ret == 0); + } +} + +/** + * gfs2_log_write_bh - write a buffer's content to the log + * @sdp: The super block + * @bh: The buffer pointing to the in-place location + * + * This writes the content of the buffer to the next available location + * in the log. The buffer will be unlocked once the i/o to the log has + * completed. 
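+ * + * (Clarifying note: the unlock happens in gfs2_end_log_write_bh() when + * the bio carrying this segment completes; this function does not wait + * for the i/o itself.)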
+ */ - bh = mempool_alloc(gfs2_bh_pool, GFP_NOFS); - atomic_set(&bh->b_count, 1); - bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); - set_bh_page(bh, real->b_page, bh_offset(real)); - bh->b_blocknr = blkno; - bh->b_size = sdp->sd_sb.sb_bsize; - bh->b_bdev = sdp->sd_vfs->s_bdev; - bh->b_private = real; - bh->b_end_io = gfs2_fake_write_endio; +static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh)); +} - gfs2_log_incr_head(sdp); - atomic_inc(&sdp->sd_log_in_flight); +/** + * gfs2_log_write_page - write one block stored in a page, into the log + * @sdp: The superblock + * @page: The struct page + * + * This writes the first block-sized part of the page into the log. Note + * that the page must have been allocated from the gfs2_page_pool mempool + * and that after this has been called, ownership has been transferred and + * the page may be freed at any time. + */ - return bh; +void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) +{ + struct super_block *sb = sdp->sd_vfs; + gfs2_log_write(sdp, page, sb->s_blocksize, 0); } -static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) +static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, + u32 ld_length, u32 ld_data1) { - struct buffer_head *bh = gfs2_log_get_buf(sdp); - struct gfs2_log_descriptor *ld = bh_log_desc(bh); + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct gfs2_log_descriptor *ld = page_address(page); + clear_page(ld); ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); ld->ld_type = cpu_to_be32(ld_type); - ld->ld_length = 0; - ld->ld_data1 = 0; + ld->ld_length = cpu_to_be32(ld_length); + ld->ld_data1 = cpu_to_be32(ld_data1); ld->ld_data2 = 0; - memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); - return bh; + return page; } -static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); struct gfs2_meta_header *mh; struct gfs2_trans *tr; lock_buffer(bd->bd_bh); gfs2_log_lock(sdp); - if (!list_empty(&bd->bd_list_tr)) - goto out; tr = current->journal_info; tr->tr_touched = 1; - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); - if (!list_empty(&le->le_list)) + if (!list_empty(&bd->bd_list)) goto out; set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); @@ -276,62 +407,86 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); sdp->sd_log_num_buf++; - list_add(&le->le_list, &sdp->sd_log_le_buf); + list_add(&bd->bd_list, &sdp->sd_log_le_buf); tr->tr_num_buf_new++; out: gfs2_log_unlock(sdp); unlock_buffer(bd->bd_bh); } -static void buf_lo_before_commit(struct gfs2_sbd *sdp) +static void gfs2_check_magic(struct buffer_head *bh) +{ + void *kaddr; + __be32 *ptr; + + clear_buffer_escaped(bh); + kaddr = kmap_atomic(bh->b_page); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + set_buffer_escaped(bh); + kunmap_atomic(kaddr); +} + +static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, + unsigned int total, struct list_head *blist, + bool is_databuf) { - struct buffer_head *bh; struct gfs2_log_descriptor *ld; struct 
gfs2_bufdata *bd1 = NULL, *bd2; - unsigned int total; - unsigned int limit; + struct page *page; unsigned int num; unsigned n; __be64 *ptr; - limit = buf_limit(sdp); - /* for 4k blocks, limit = 503 */ - gfs2_log_lock(sdp); - total = sdp->sd_log_num_buf; - bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); + bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); while(total) { num = total; if (total > limit) num = limit; gfs2_log_unlock(sdp); - bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA); + page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num); + ld = page_address(page); gfs2_log_lock(sdp); - ld = bh_log_desc(bh); - ptr = bh_log_ptr(bh); - ld->ld_length = cpu_to_be32(num + 1); - ld->ld_data1 = cpu_to_be32(num); + ptr = (__be64 *)(ld + 1); n = 0; - list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, - bd_le.le_list) { + list_for_each_entry_continue(bd1, blist, bd_list) { *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (is_databuf) { + gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 1 : 0); + } if (++n >= num) break; } gfs2_log_unlock(sdp); - submit_bh(WRITE_SYNC, bh); + gfs2_log_write_page(sdp, page); gfs2_log_lock(sdp); n = 0; - list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, - bd_le.le_list) { + list_for_each_entry_continue(bd2, blist, bd_list) { get_bh(bd2->bd_bh); gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); - bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE_SYNC, bh); + + if (buffer_escaped(bd2->bd_bh)) { + void *kaddr; + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + ptr = page_address(page); + kaddr = kmap_atomic(bd2->bd_bh->b_page); + memcpy(ptr, kaddr + bh_offset(bd2->bd_bh), + bd2->bd_bh->b_size); + kunmap_atomic(kaddr); + *(__be32 *)ptr = 0; + clear_buffer_escaped(bd2->bd_bh); + unlock_buffer(bd2->bd_bh); + brelse(bd2->bd_bh); + gfs2_log_write_page(sdp, page); + } else { + gfs2_log_write_bh(sdp, bd2->bd_bh); + } gfs2_log_lock(sdp); if (++n >= num) break; @@ -343,14 +498,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); } +static void buf_lo_before_commit(struct gfs2_sbd *sdp) +{ + unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ + + gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, + &sdp->sd_log_le_buf, 0); +} + static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) { struct list_head *head = &sdp->sd_log_le_buf; struct gfs2_bufdata *bd; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); sdp->sd_log_num_buf--; gfs2_unpin(sdp, bd->bd_bh, ai); @@ -437,9 +600,8 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); } -static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); struct gfs2_glock *gl = bd->bd_gl; struct gfs2_trans *tr; @@ -449,48 +611,48 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); set_bit(GLF_LFLUSH, &gl->gl_flags); - list_add(&le->le_list, &sdp->sd_log_le_revoke); + list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } static void revoke_lo_before_commit(struct gfs2_sbd *sdp) { struct 
gfs2_log_descriptor *ld; struct gfs2_meta_header *mh; - struct buffer_head *bh; unsigned int offset; struct list_head *head = &sdp->sd_log_le_revoke; struct gfs2_bufdata *bd; + struct page *page; + unsigned int length; if (!sdp->sd_log_num_revoke) return; - bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE); - ld = bh_log_desc(bh); - ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, - sizeof(u64))); - ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); + page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); + ld = page_address(page); offset = sizeof(struct gfs2_log_descriptor); - list_for_each_entry(bd, head, bd_le.le_list) { + list_for_each_entry(bd, head, bd_list) { sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE_SYNC, bh); - bh = gfs2_log_get_buf(sdp); - mh = (struct gfs2_meta_header *)bh->b_data; + gfs2_log_write_page(sdp, page); + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + mh = page_address(page); + clear_page(mh); mh->mh_magic = cpu_to_be32(GFS2_MAGIC); mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); offset = sizeof(struct gfs2_meta_header); } - *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno); + *(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno); offset += sizeof(u64); } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE_SYNC, bh); + gfs2_log_write_page(sdp, page); } static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) @@ -500,8 +662,8 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) struct gfs2_glock *gl; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); gl = bd->bd_gl; atomic_dec(&gl->gl_revokes); clear_bit(GLF_LFLUSH, &gl->gl_flags); @@ -604,108 +766,33 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) * blocks, which isn't an enormous overhead but twice as much as * for normal metadata blocks. 
*/ -static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); struct gfs2_trans *tr = current->journal_info; struct address_space *mapping = bd->bd_bh->b_page->mapping; struct gfs2_inode *ip = GFS2_I(mapping->host); lock_buffer(bd->bd_bh); gfs2_log_lock(sdp); - if (tr) { - if (!list_empty(&bd->bd_list_tr)) - goto out; + if (tr) tr->tr_touched = 1; - if (gfs2_is_jdata(ip)) { - tr->tr_num_buf++; - list_add(&bd->bd_list_tr, &tr->tr_list_buf); - } - } - if (!list_empty(&le->le_list)) + if (!list_empty(&bd->bd_list)) goto out; - set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); if (gfs2_is_jdata(ip)) { gfs2_pin(sdp, bd->bd_bh); tr->tr_num_databuf_new++; sdp->sd_log_num_databuf++; - list_add_tail(&le->le_list, &sdp->sd_log_le_databuf); + list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); } else { - list_add_tail(&le->le_list, &sdp->sd_log_le_ordered); + list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); } out: gfs2_log_unlock(sdp); unlock_buffer(bd->bd_bh); } -static void gfs2_check_magic(struct buffer_head *bh) -{ - void *kaddr; - __be32 *ptr; - - clear_buffer_escaped(bh); - kaddr = kmap_atomic(bh->b_page); - ptr = kaddr + bh_offset(bh); - if (*ptr == cpu_to_be32(GFS2_MAGIC)) - set_buffer_escaped(bh); - kunmap_atomic(kaddr); -} - -static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, - struct list_head *list, struct list_head *done, - unsigned int n) -{ - struct buffer_head *bh1; - struct gfs2_log_descriptor *ld; - struct gfs2_bufdata *bd; - __be64 *ptr; - - if (!bh) - return; - - ld = bh_log_desc(bh); - ld->ld_length = cpu_to_be32(n + 1); - ld->ld_data1 = cpu_to_be32(n); - - ptr = bh_log_ptr(bh); - - get_bh(bh); - submit_bh(WRITE_SYNC, bh); - gfs2_log_lock(sdp); - while(!list_empty(list)) { - bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); - list_move_tail(&bd->bd_le.le_list, done); - get_bh(bd->bd_bh); - while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) { - gfs2_log_incr_head(sdp); - ptr += 2; - } - gfs2_log_unlock(sdp); - lock_buffer(bd->bd_bh); - if (buffer_escaped(bd->bd_bh)) { - void *kaddr; - bh1 = gfs2_log_get_buf(sdp); - kaddr = kmap_atomic(bd->bd_bh->b_page); - memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh), - bh1->b_size); - kunmap_atomic(kaddr); - *(__be32 *)bh1->b_data = 0; - clear_buffer_escaped(bd->bd_bh); - unlock_buffer(bd->bd_bh); - brelse(bd->bd_bh); - } else { - bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); - } - submit_bh(WRITE_SYNC, bh1); - gfs2_log_lock(sdp); - ptr += 2; - } - gfs2_log_unlock(sdp); - brelse(bh); -} - /** * databuf_lo_before_commit - Scan the data buffers, writing as we go * @@ -713,37 +800,10 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, static void databuf_lo_before_commit(struct gfs2_sbd *sdp) { - struct gfs2_bufdata *bd = NULL; - struct buffer_head *bh = NULL; - unsigned int n = 0; - __be64 *ptr = NULL, *end = NULL; - LIST_HEAD(processed); - LIST_HEAD(in_progress); + unsigned int limit = buf_limit(sdp) / 2; - gfs2_log_lock(sdp); - while (!list_empty(&sdp->sd_log_le_databuf)) { - if (ptr == end) { - gfs2_log_unlock(sdp); - gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); - n = 0; - bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA); - ptr = bh_log_ptr(bh); - end = bh_ptr_end(bh) - 1; - gfs2_log_lock(sdp); - continue; - } - bd = list_entry(sdp->sd_log_le_databuf.next, 
struct gfs2_bufdata, bd_le.le_list); - list_move_tail(&bd->bd_le.le_list, &in_progress); - gfs2_check_magic(bd->bd_bh); - *ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr); - *ptr++ = cpu_to_be64(buffer_escaped(bh) ? 1 : 0); - n++; - } - gfs2_log_unlock(sdp); - gfs2_write_blocks(sdp, bh, &in_progress, &processed, n); - gfs2_log_lock(sdp); - list_splice(&processed, &sdp->sd_log_le_databuf); - gfs2_log_unlock(sdp); + gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, + &sdp->sd_log_le_databuf, 1); } static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -822,8 +882,8 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) struct gfs2_bufdata *bd; while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); + bd = list_entry(head->next, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); sdp->sd_log_num_databuf--; gfs2_unpin(sdp, bd->bd_bh, ai); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 3c0b273..954a330 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,8 @@ extern const struct gfs2_log_operations gfs2_rg_lops; extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; +extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); +extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { @@ -44,17 +46,17 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_init_le(struct gfs2_log_element *le, +static inline void lops_init_le(struct gfs2_bufdata *bd, const struct gfs2_log_operations *lops) { - INIT_LIST_HEAD(&le->le_list); - le->le_ops = lops; + INIT_LIST_HEAD(&bd->bd_list); + bd->bd_ops = lops; } -static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - if (le->le_ops->lo_add) - le->le_ops->lo_add(sdp, le); + if (bd->bd_ops->lo_add) + bd->bd_ops->lo_add(sdp, bd); } static inline void lops_before_commit(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 754426b..6cdb0f2 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -70,16 +70,6 @@ static void gfs2_init_gl_aspace_once(void *foo) address_space_init_once(mapping); } -static void *gfs2_bh_alloc(gfp_t mask, void *data) -{ - return alloc_buffer_head(mask); -} - -static void gfs2_bh_free(void *ptr, void *data) -{ - return free_buffer_head(ptr); -} - /** * init_gfs2_fs - Register GFS2 as a filesystem * @@ -143,6 +133,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_quotad_cachep) goto fail; + gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk", + sizeof(struct gfs2_blkreserv), + 0, 0, NULL); + if (!gfs2_rsrv_cachep) + goto fail; + register_shrinker(&qd_shrinker); error = register_filesystem(&gfs2_fs_type); @@ -164,8 +160,8 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_recovery; - gfs2_bh_pool = mempool_create(1024, gfs2_bh_alloc, gfs2_bh_free, NULL); - if (!gfs2_bh_pool) + gfs2_page_pool = mempool_create_page_pool(64, 0); + if (!gfs2_page_pool) goto fail_control; gfs2_register_debugfs(); @@ -186,6 +182,9 @@ fail: unregister_shrinker(&qd_shrinker); gfs2_glock_exit(); + if (gfs2_rsrv_cachep) + kmem_cache_destroy(gfs2_rsrv_cachep); + if (gfs2_quotad_cachep) kmem_cache_destroy(gfs2_quotad_cachep); @@ -225,7 +224,8 @@ static void __exit exit_gfs2_fs(void) 
rcu_barrier(); - mempool_destroy(gfs2_bh_pool); + mempool_destroy(gfs2_page_pool); + kmem_cache_destroy(gfs2_rsrv_cachep); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 181586e..6c1e5d1 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -293,11 +293,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, bd->bd_bh = bh; bd->bd_gl = gl; - INIT_LIST_HEAD(&bd->bd_list_tr); if (meta) - lops_init_le(&bd->bd_le, &gfs2_buf_lops); + lops_init_le(bd, &gfs2_buf_lops); else - lops_init_le(&bd->bd_le, &gfs2_databuf_lops); + lops_init_le(bd, &gfs2_databuf_lops); bh->b_private = bd; if (meta) @@ -313,7 +312,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); - list_del_init(&bd->bd_le.le_list); + list_del_init(&bd->bd_list); if (meta) { gfs2_assert_warn(sdp, sdp->sd_log_num_buf); sdp->sd_log_num_buf--; @@ -375,33 +374,24 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) * @ip: The GFS2 inode * @height: The level of this buf in the metadata (indir addr) tree (if any) * @num: The block number (device relative) of the buffer - * @new: Non-zero if we may create a new buffer * @bhp: the buffer is returned here * * Returns: errno */ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - int new, struct buffer_head **bhp) + struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_glock *gl = ip->i_gl; struct buffer_head *bh; int ret = 0; + u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; - if (new) { - BUG_ON(height == 0); - bh = gfs2_meta_new(gl, num); - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); - gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); - } else { - u32 mtype = height ? 
GFS2_METATYPE_IN : GFS2_METATYPE_DI; - ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); - if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { - brelse(bh); - ret = -EIO; - } + ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; } *bhp = bh; return ret; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 22c5265..c30973b 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -65,12 +65,12 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, - int new, struct buffer_head **bhp); + struct buffer_head **bhp); static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) { - return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, 0, bhp); + return gfs2_meta_indirect_buffer(ip, 0, ip->i_no_addr, bhp); } struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6f3a18f..b8c250f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -99,7 +99,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); - INIT_LIST_HEAD(&sdp->sd_log_le_rg); INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); @@ -994,6 +993,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_jid = option; break; case Opt_id: + case Opt_nodir: /* Obsolete, but left for backward compat purposes */ break; case Opt_first: @@ -1002,12 +1002,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) goto hostdata_error; ls->ls_first = option; break; - case Opt_nodir: - ret = match_int(&tmp[0], &option); - if (ret || (option != 0 && option != 1)) - goto hostdata_error; - ls->ls_nodir = option; - break; case Opt_err: default: hostdata_error: diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6019da3..b97178e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -652,7 +652,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, } memset(&q, 0, sizeof(struct gfs2_quota)); - err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q)); + err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q)); if (err < 0) return err; @@ -744,7 +744,7 @@ get_a_page: i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; mark_inode_dirty(inode); - return err; + return 0; unlock_out: unlock_page(page); @@ -852,7 +852,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) memset(&q, 0, sizeof(struct gfs2_quota)); pos = qd2offset(qd); - error = gfs2_internal_read(ip, NULL, (char *)&q, &pos, sizeof(q)); + error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q)); if (error < 0) return error; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3df65c9..f74fb9b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -70,15 +70,15 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, /** * gfs2_setbit - Set a bit in the bitmaps - * @buffer: the buffer that holds the bitmaps - * @buflen: the length (in bytes) of the buffer + * @rgd: the resource group descriptor + * @buf2: the clone buffer that holds the bitmaps + * @bi: the bitmap structure * @block: the block to set * @new_state: the new state of the block * */ -static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, 
unsigned char *buf1, - unsigned char *buf2, unsigned int offset, +static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf2, struct gfs2_bitmap *bi, u32 block, unsigned char new_state) { @@ -86,8 +86,8 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, unsigned int buflen = bi->bi_len; const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; - byte1 = buf1 + offset + (block / GFS2_NBBY); - end = buf1 + offset + buflen; + byte1 = bi->bi_bh->b_data + bi->bi_offset + (block / GFS2_NBBY); + end = bi->bi_bh->b_data + bi->bi_offset + buflen; BUG_ON(byte1 >= end); @@ -110,7 +110,7 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, *byte1 ^= (cur_state ^ new_state) << bit; if (buf2) { - byte2 = buf2 + offset + (block / GFS2_NBBY); + byte2 = buf2 + bi->bi_offset + (block / GFS2_NBBY); cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; *byte2 ^= (cur_state ^ new_state) << bit; } @@ -118,6 +118,7 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, /** * gfs2_testbit - test a bit in the bitmaps + * @rgd: the resource group descriptor * @buffer: the buffer that holds the bitmaps * @buflen: the length (in bytes) of the buffer * @block: the block to read @@ -179,7 +180,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) /** * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing * a block in a given allocation state. - * @buffer: the buffer that holds the bitmaps + * @buf: the buffer that holds the bitmaps * @len: the length (in bytes) of the buffer * @goal: start search at this block's bit-pair (within @buffer) * @state: GFS2_BLKST_XXX the state of the block we're looking for. @@ -231,6 +232,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, /** * gfs2_bitcount - count the number of bits in a certain state + * @rgd: the resource group descriptor * @buffer: the buffer that holds the bitmaps * @buflen: the length (in bytes) of the buffer * @state: the state of the block we're looking for @@ -264,7 +266,6 @@ static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, /** * gfs2_rgrp_verify - Verify that a resource group is consistent - * @sdp: the filesystem * @rgd: the rgrp * */ @@ -322,7 +323,8 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) /** * gfs2_blk2rgrpd - Find resource group for a given data/meta block number * @sdp: The GFS2 superblock - * @n: The data block number + * @blk: The data block number + * @exact: True if this needs to be an exact match * * Returns: The resource group, or NULL if not found */ @@ -380,7 +382,7 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) /** * gfs2_rgrpd_get_next - get the next RG - * @rgd: A RG + * @rgd: the resource group descriptor * * Returns: The next rgrp */ @@ -529,6 +531,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) /** * gfs2_ri_total - Total up the file system space, according to the rindex. 
+ * @sdp: the filesystem * */ u64 gfs2_ri_total(struct gfs2_sbd *sdp) @@ -537,16 +540,14 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) struct inode *inode = sdp->sd_rindex; struct gfs2_inode *ip = GFS2_I(inode); char buf[sizeof(struct gfs2_rindex)]; - struct file_ra_state ra_state; int error, rgrps; - file_ra_state_init(&ra_state, inode->i_mapping); for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) break; - error = gfs2_internal_read(ip, &ra_state, buf, &pos, + error = gfs2_internal_read(ip, buf, &pos, sizeof(struct gfs2_rindex)); if (error != sizeof(struct gfs2_rindex)) break; @@ -582,13 +583,12 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) /** * read_rindex_entry - Pull in a new resource index entry from the disk - * @gl: The glock covering the rindex inode + * @ip: Pointer to the rindex inode * * Returns: 0 on success, > 0 on EOF, error code otherwise */ -static int read_rindex_entry(struct gfs2_inode *ip, - struct file_ra_state *ra_state) +static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); @@ -599,7 +599,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, if (pos >= i_size_read(&ip->i_inode)) return 1; - error = gfs2_internal_read(ip, ra_state, (char *)&buf, &pos, + error = gfs2_internal_read(ip, (char *)&buf, &pos, sizeof(struct gfs2_rindex)); if (error != sizeof(struct gfs2_rindex)) @@ -655,13 +655,10 @@ fail: static int gfs2_ri_update(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct inode *inode = &ip->i_inode; - struct file_ra_state ra_state; int error; - file_ra_state_init(&ra_state, inode->i_mapping); do { - error = read_rindex_entry(ip, &ra_state); + error = read_rindex_entry(ip); } while (error == 0); if (error < 0) @@ -741,7 +738,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) /** * gfs2_rgrp_go_lock - Read in a RG's header and bitmaps - * @rgd: the struct gfs2_rgrpd describing the RG to read in + * @gh: The glock holder for the resource group * * Read in all of a Resource Group's header and bitmap blocks. * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. @@ -801,7 +798,7 @@ fail: /** * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() - * @rgd: the struct gfs2_rgrpd describing the RG to read in + * @gh: The glock holder for the resource group * */ @@ -1002,11 +999,13 @@ struct gfs2_qadata *gfs2_qadata_get(struct gfs2_inode *ip) * Returns: the struct gfs2_qadata */ -static struct gfs2_blkreserv *gfs2_blkrsv_get(struct gfs2_inode *ip) +static int gfs2_blkrsv_get(struct gfs2_inode *ip) { BUG_ON(ip->i_res != NULL); - ip->i_res = kzalloc(sizeof(struct gfs2_blkreserv), GFP_NOFS); - return ip->i_res; + ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); + if (!ip->i_res) + return -ENOMEM; + return 0; } /** @@ -1038,6 +1037,8 @@ static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk) /** * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes * @rgd: The rgrp + * @last_unlinked: block address of the last dinode we unlinked + * @skip: block address we should explicitly not unlink * * Returns: 0 if no error * The inode, if one has been found, in inode. 
@@ -1102,7 +1103,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip /** * get_local_rgrp - Choose and lock a rgrp for allocation * @ip: the inode to reserve space for - * @rgp: the chosen and locked rgrp + * @last_unlinked: the last unlinked block * * Try to acquire rgrp in way which avoids contending with others. * @@ -1164,13 +1165,14 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) static void gfs2_blkrsv_put(struct gfs2_inode *ip) { BUG_ON(ip->i_res == NULL); - kfree(ip->i_res); + kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); ip->i_res = NULL; } /** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for + * @requested: the number of blocks to be reserved * * Returns: errno */ @@ -1179,14 +1181,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_blkreserv *rs; - int error = 0; + int error; u64 last_unlinked = NO_BLOCK; int tries = 0; - rs = gfs2_blkrsv_get(ip); - if (!rs) - return -ENOMEM; + error = gfs2_blkrsv_get(ip); + if (error) + return error; + rs = ip->i_res; rs->rs_requested = requested; if (gfs2_assert_warn(sdp, requested)) { error = -EINVAL; @@ -1268,7 +1271,6 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) * @rgd: the resource group descriptor * @goal: the goal block within the RG (start here to search for avail block) * @state: GFS2_BLKST_XXX the before-allocation state to find - * @dinode: TRUE if the first block we allocate is for a dinode * @rbi: address of the pointer to the bitmap containing the block found * * Walk rgrp's bitmap to find bits that represent a block in @state. @@ -1282,13 +1284,12 @@ static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) * Returns: the block number found relative to the bitmap rbi */ -static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, - unsigned char state, +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, unsigned char state, struct gfs2_bitmap **rbi) { struct gfs2_bitmap *bi = NULL; const u32 length = rgd->rd_length; - u32 blk = BFITNOENT; + u32 biblk = BFITNOENT; unsigned int buf, x; const u8 *buffer = NULL; @@ -1325,8 +1326,8 @@ do_search: if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) buffer = bi->bi_clone + bi->bi_offset; - blk = gfs2_bitfit(buffer, bi->bi_len, goal, state); - if (blk != BFITNOENT) + biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state); + if (biblk != BFITNOENT) break; if ((goal == 0) && (state == GFS2_BLKST_FREE)) @@ -1339,10 +1340,10 @@ skip: goal = 0; } - if (blk != BFITNOENT) + if (biblk != BFITNOENT) *rbi = bi; - return blk; + return biblk; } /** @@ -1367,8 +1368,8 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, *n = 0; buffer = bi->bi_bh->b_data + bi->bi_offset; gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi, blk, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); + gfs2_setbit(rgd, bi->bi_clone, bi, blk, + dinode ? 
GFS2_BLKST_DINODE : GFS2_BLKST_USED); (*n)++; goal = blk; while (*n < elen) { @@ -1378,8 +1379,7 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi, if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != GFS2_BLKST_FREE) break; - gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi, goal, GFS2_BLKST_USED); + gfs2_setbit(rgd, bi->bi_clone, bi, goal, GFS2_BLKST_USED); (*n)++; } blk = gfs2_bi2rgd_blk(bi, blk); @@ -1436,8 +1436,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, bi->bi_len); } gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, - bi, buf_blk, new_state); + gfs2_setbit(rgd, NULL, bi, buf_blk, new_state); } return rgd; @@ -1557,7 +1556,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, ip->i_inode.i_gid); rgd->rd_free_clone -= *nblocks; - trace_gfs2_block_alloc(ip, block, *nblocks, + trace_gfs2_block_alloc(ip, rgd, block, *nblocks, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; return 0; @@ -1584,7 +1583,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); if (!rgd) return; - trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); + trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); @@ -1622,7 +1621,7 @@ void gfs2_unlink_di(struct inode *inode) rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); if (!rgd) return; - trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED); + trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); } @@ -1652,7 +1651,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { gfs2_free_uninit_di(rgd, ip->i_no_addr); - trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE); + trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); gfs2_meta_wipe(ip, ip->i_no_addr, 1); } @@ -1752,7 +1751,6 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, * and initialize an array of glock holders for them * @rlist: the list of resource groups * @state: the lock state to acquire the RG lock in - * @flags: the modifier flags for the holder structures * * FIXME: Don't use NOFAIL * diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6172fa7..713e621 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1554,7 +1554,7 @@ out_unlock: out: /* Case 3 starts here */ truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; flush_delayed_work_sync(&ip->i_gl->gl_work); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d33172c..9c2592b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -368,10 +368,7 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) struct gfs2_jdesc *jd; int rv; - rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); - if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) - goto out; rv = -EBUSY; if (sdp->sd_jdesc->jd_jid == jid) goto out; @@ -396,8 +393,13 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) if (rv != 1) return -EINVAL; - rv = gfs2_recover_set(sdp, jid); + if 
(test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { + rv = -ESHUTDOWN; + goto out; + } + rv = gfs2_recover_set(sdp, jid); +out: return rv ? rv : len; } diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index dfa89cd..1b8b815 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -457,10 +457,10 @@ TRACE_EVENT(gfs2_bmap, /* Keep track of blocks as they are allocated/freed */ TRACE_EVENT(gfs2_block_alloc, - TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len, - u8 block_state), + TP_PROTO(const struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 block, unsigned len, u8 block_state), - TP_ARGS(ip, block, len, block_state), + TP_ARGS(ip, rgd, block, len, block_state), TP_STRUCT__entry( __field( dev_t, dev ) @@ -468,6 +468,8 @@ TRACE_EVENT(gfs2_block_alloc, __field( u64, inum ) __field( u32, len ) __field( u8, block_state ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) ), TP_fast_assign( @@ -476,14 +478,18 @@ TRACE_EVENT(gfs2_block_alloc, __entry->inum = ip->i_no_addr; __entry->len = len; __entry->block_state = block_state; + __entry->rd_addr = rgd->rd_addr; + __entry->rd_free_clone = rgd->rd_free_clone; ), - TP_printk("%u,%u bmap %llu alloc %llu/%lu %s", + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long)__entry->len, - block_state_name(__entry->block_state)) + block_state_name(__entry->block_state), + (unsigned long long)__entry->rd_addr, + __entry->rd_free_clone) ); #endif /* _TRACE_GFS2_H */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 86ac75d..ad3e2fb 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -50,8 +50,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); - INIT_LIST_HEAD(&tr->tr_list_buf); - gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); error = gfs2_glock_nq(&tr->tr_t_gh); @@ -93,10 +91,21 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) up_read(&sdp->sd_log_flush_lock); } +static void gfs2_print_trans(const struct gfs2_trans *tr) +{ + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); + printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + tr->tr_num_buf_new, tr->tr_num_buf_rm, + tr->tr_num_databuf_new, tr->tr_num_databuf_rm, + tr->tr_num_revoke, tr->tr_num_revoke_rm); +} + void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; - + s64 nbuf; BUG_ON(!tr); current->journal_info = NULL; @@ -110,16 +119,13 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) return; } - if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) { - fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ", - tr->tr_num_buf, tr->tr_blocks); - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); - } - if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) { - fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ", - tr->tr_num_revoke, tr->tr_revokes); - print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); - } + nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; + nbuf -= tr->tr_num_buf_rm; + nbuf -= tr->tr_num_databuf_rm; + + if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) && + (tr->tr_num_revoke <= tr->tr_revokes))) + 
gfs2_print_trans(tr); gfs2_log_commit(sdp, tr); if (tr->tr_t_gh.gh_gl) { @@ -152,16 +158,16 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) gfs2_attach_bufdata(gl, bh, meta); bd = bh->b_private; } - lops_add(sdp, &bd->bd_le); + lops_add(sdp, bd); } void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { - BUG_ON(!list_empty(&bd->bd_le.le_list)); + BUG_ON(!list_empty(&bd->bd_list)); BUG_ON(!list_empty(&bd->bd_ail_st_list)); BUG_ON(!list_empty(&bd->bd_ail_gl_list)); - lops_init_le(&bd->bd_le, &gfs2_revoke_lops); - lops_add(sdp, &bd->bd_le); + lops_init_le(bd, &gfs2_revoke_lops); + lops_add(sdp, bd); } void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) @@ -171,9 +177,9 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) unsigned int n = len; gfs2_log_lock(sdp); - list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) { + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_list) { if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { - list_del_init(&bd->bd_le.le_list); + list_del_init(&bd->bd_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); sdp->sd_log_num_revoke--; kmem_cache_free(gfs2_bufdata_cachep, bd); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9e7765e..f00d7c5 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -25,7 +25,8 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; struct kmem_cache *gfs2_quotad_cachep __read_mostly; -mempool_t *gfs2_bh_pool __read_mostly; +struct kmem_cache *gfs2_rsrv_cachep __read_mostly; +mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index a4ce76c..3586b0d 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -152,7 +152,8 @@ extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; -extern mempool_t *gfs2_bh_pool; +extern struct kmem_cache *gfs2_rsrv_cachep; +extern mempool_t *gfs2_page_pool; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 737dbeb..761ec06 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -532,7 +532,7 @@ out: void hfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFS_I(inode)->rsrc_inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ceb1c28..a9bca4b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -154,7 +154,7 @@ static void hfsplus_evict_inode(struct inode *inode) { dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFSPLUS_I(inode)->rsrc_inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07c516b..2afa5bb 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -240,7 +240,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) static void hostfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - 
end_writeback(inode); + clear_inode(inode); if (HOSTFS_I(inode)->fd != -1) { close_file(&HOSTFS_I(inode)->fd); HOSTFS_I(inode)->fd = -1; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 3b2cec2..b43066c 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -299,7 +299,7 @@ void hpfs_write_if_changed(struct inode *inode) void hpfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (!inode->i_nlink) { hpfs_lock(inode->i_sb); hpfs_remove_fnode(inode->i_sb, inode->i_ino); diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index a80e45a..d4f93b5 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -614,7 +614,7 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb) void hppfs_evict_inode(struct inode *ino) { - end_writeback(ino); + clear_inode(ino); dput(HPPFS_I(ino)->proc_dentry); mntput(ino->i_sb->s_fs_info); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 001ef01..cc9281b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -393,7 +393,7 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) static void hugetlbfs_evict_inode(struct inode *inode) { truncate_hugepages(inode, 0); - end_writeback(inode); + clear_inode(inode); } static inline void @@ -135,8 +135,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fop = &empty_fops; inode->__i_nlink = 1; inode->i_opflags = 0; - inode->i_uid = 0; - inode->i_gid = 0; + i_uid_write(inode, 0); + i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_blocks = 0; @@ -486,7 +486,7 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); -void end_writeback(struct inode *inode) +void clear_inode(struct inode *inode) { might_sleep(); /* @@ -500,11 +500,10 @@ void end_writeback(struct inode *inode) BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); - inode_sync_wait(inode); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } -EXPORT_SYMBOL(end_writeback); +EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected @@ -531,12 +530,20 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since + * the inode has I_FREEING set, flusher thread won't start new work on + * the inode. We just have to wait for running writeback to finish. 
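+ * (That wait is what the inode_wait_for_writeback() call below does.)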
+ */ + inode_wait_for_writeback(inode); + if (op->evict_inode) { op->evict_inode(inode); } else { if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) bd_forget(inode); @@ -1647,6 +1654,7 @@ void __init inode_init_early(void) HASH_EARLY, &i_hash_shift, &i_hash_mask, + 0, 0); for (loop = 0; loop < (1U << i_hash_shift); loop++) @@ -1677,6 +1685,7 @@ void __init inode_init(void) 0, &i_hash_shift, &i_hash_mask, + 0, 0); for (loop = 0; loop < (1U << i_hash_shift); loop++) @@ -1732,11 +1741,9 @@ EXPORT_SYMBOL(inode_init_owner); */ bool inode_owner_or_capable(const struct inode *inode) { - struct user_namespace *ns = inode_userns(inode); - - if (current_user_ns() == ns && current_fsuid() == inode->i_uid) + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (ns_capable(ns, CAP_FOWNER)) + if (inode_capable(inode, CAP_FOWNER)) return true; return false; } diff --git a/fs/ioprio.c b/fs/ioprio.c index 0f1b951..5e6dbe89 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -37,8 +37,8 @@ int set_task_ioprio(struct task_struct *task, int ioprio) rcu_read_lock(); tcred = __task_cred(task); - if (tcred->uid != cred->euid && - tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) { + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); return -EPERM; } @@ -65,6 +65,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) struct task_struct *p, *g; struct user_struct *user; struct pid *pgrp; + kuid_t uid; int ret; switch (class) { @@ -110,16 +111,19 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!uid_valid(uid)) + break; if (!who) user = current_user(); else - user = find_user(who); + user = find_user(uid); if (!user) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != who) + if (!uid_eq(task_uid(p), uid)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -174,6 +178,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) struct task_struct *g, *p; struct user_struct *user; struct pid *pgrp; + kuid_t uid; int ret = -ESRCH; int tmpio; @@ -203,16 +208,17 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); if (!who) user = current_user(); else - user = find_user(who); + user = find_user(uid); if (!user) break; do_each_thread(g, p) { - if (__task_cred(p)->uid != user->uid) + if (!uid_eq(task_uid(p), user->uid)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 05f0754..08c0304 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -508,20 +508,19 @@ int cleanup_journal_tail(journal_t *journal) /* * We need to make sure that any blocks that were recently written out * --- perhaps by log_do_checkpoint() --- are flushed out before we - * drop the transactions from the journal. It's unlikely this will be - * necessary, especially with an appropriately sized journal, but we - * need this to guarantee correctness. Fortunately - * cleanup_journal_tail() doesn't get called all that often. + * drop the transactions from the journal. 
Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. */ - if (journal->j_flags & JFS_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); spin_lock(&journal->j_state_lock); - if (!tid_gt(first_tid, journal->j_tail_sequence)) { - spin_unlock(&journal->j_state_lock); - /* Someone else cleaned up journal so return 0 */ - return 0; - } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ @@ -539,8 +538,6 @@ int cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); - if (!(journal->j_flags & JFS_ABORT)) - journal_update_superblock(journal, 1); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f2b9a57..52c15c7 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal) int tag_flag; int i; struct blk_plug plug; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -307,7 +308,16 @@ void journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); - journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -413,13 +423,16 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; + /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. 
*/ blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, - WRITE_SYNC); + write_op); blk_finish_plug(&plug); /* @@ -478,7 +491,7 @@ void journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); - journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have @@ -649,7 +662,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE_SYNC, bh); + submit_bh(write_op, bh); } cond_resched(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 0971e92..425c2f2 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); @@ -921,8 +923,33 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } return journal_start_thread(journal); } @@ -999,35 +1026,15 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** - * void journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void journal_update_superblock(journal_t *journal, int wait) +static void journal_write_superblock(journal_t *journal, int write_op) { - journal_superblock_t *sb = journal->j_superblock; struct buffer_head *bh = journal->j_sb_buffer; + int ret; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. 
- */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } - + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1045,42 +1052,100 @@ void journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + journal_write_superblock(journal, write_op); + + /* Log is no longer empty */ + spin_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. 
+ */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); + journal_write_superblock(journal, WRITE_FUA); - trace_jbd_update_superblock_end(journal, wait); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_SYNC); } /* @@ -1251,6 +1316,8 @@ int journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1266,16 +1333,14 @@ int journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; journal->j_tail_sequence = ++journal->j_transaction_sequence; - journal_update_superblock(journal, 1); - } else { + mark_journal_empty(journal); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1455,7 +1520,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned int old_tail; spin_lock(&journal->j_state_lock); @@ -1490,6 +1554,7 @@ int journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1497,14 +1562,9 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. 
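mark_journal_empty() above leans on the long-standing on-disk convention that s_start == 0 means "log empty, nothing to replay". A tiny sketch of that convention, with an illustrative struct rather than the real journal_superblock_t:

```c
#include <stdio.h>

/* Illustrative layout; the real journal_superblock_t has many more fields. */
struct journal_sb {
	unsigned int s_sequence;
	unsigned int s_start;    /* 0 == "flushed, no recovery needed" */
};

static void mark_journal_empty(struct journal_sb *sb, unsigned int tail_seq)
{
	sb->s_sequence = tail_seq;
	sb->s_start = 0;
}

static int journal_needs_recovery(const struct journal_sb *sb)
{
	return sb->s_start != 0;
}

int main(void)
{
	struct journal_sb sb = { .s_sequence = 7, .s_start = 1024 };

	printf("recover=%d\n", journal_needs_recovery(&sb)); /* 1 */
	mark_journal_empty(&sb, 8);
	printf("recover=%d\n", journal_needs_recovery(&sb)); /* 0 */
	return 0;
}
```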
Any future * commits of data to the journal will restore the current * s_start value. */ + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1544,8 +1604,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) - journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1613,7 +1677,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index b2a7e52..febc10d 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle) } } - if (handle->h_sync) - transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index bb6f993..3d3092e 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -240,7 +240,7 @@ void jffs2_evict_inode (struct inode *inode) jffs2_dbg(1, "%s(): ino #%lu mode %o\n", __func__, inode->i_ino, inode->i_mode); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); jffs2_do_clear_inode(c, f); } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77b69b2..4692bf3 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -169,7 +169,7 @@ void jfs_evict_inode(struct inode *inode) } else { truncate_inode_pages(&inode->i_data, 0); } - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } @@ -68,7 +68,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na int dcache_dir_open(struct inode *inode, struct file *file) { - static struct qstr cursor_name = {.len = 1, .name = "."}; + static struct qstr cursor_name = QSTR_INIT(".", 1); file->private_data = d_alloc(file->f_path.dentry, &cursor_name); @@ -225,7 +225,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); struct dentry *dentry; struct inode *root; - struct qstr d_name = {.name = name, .len = strlen(name)}; + struct qstr d_name = QSTR_INIT(name, strlen(name)); if (IS_ERR(s)) return ERR_CAST(s); @@ -1446,7 +1446,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) struct inode *inode = dentry->d_inode; int error; - if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) + if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) return -EACCES; if (!S_ISREG(inode->i_mode)) return -EINVAL; diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index e3ab5e5..f1cb512 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -2175,7 +2175,7 @@ void logfs_evict_inode(struct inode *inode) } } truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); /* Cheaper version of 
write_inode. All changes are concealed in * aliases, which are moved back. No write to the medium happens. diff --git a/fs/minix/inode.c b/fs/minix/inode.c index fcb05d2..2a503ad 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -32,7 +32,7 @@ static void minix_evict_inode(struct inode *inode) minix_truncate(inode); } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (!inode->i_nlink) minix_free_inode(inode); } @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/export.h> +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> @@ -116,47 +117,37 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ -static int do_getname(const char __user *filename, char *page) -{ - int retval; - unsigned long len = PATH_MAX; - - if (!segment_eq(get_fs(), KERNEL_DS)) { - if ((unsigned long) filename >= TASK_SIZE) - return -EFAULT; - if (TASK_SIZE - (unsigned long) filename < PATH_MAX) - len = TASK_SIZE - (unsigned long) filename; - } - - retval = strncpy_from_user(page, filename, len); - if (retval > 0) { - if (retval < len) - return 0; - return -ENAMETOOLONG; - } else if (!retval) - retval = -ENOENT; - return retval; -} - static char *getname_flags(const char __user *filename, int flags, int *empty) { - char *result = __getname(); - int retval; + char *result = __getname(), *err; + int len; - if (!result) + if (unlikely(!result)) return ERR_PTR(-ENOMEM); - retval = do_getname(filename, result); - if (retval < 0) { - if (retval == -ENOENT && empty) + len = strncpy_from_user(result, filename, PATH_MAX); + err = ERR_PTR(len); + if (unlikely(len < 0)) + goto error; + + /* The empty path is special. */ + if (unlikely(!len)) { + if (empty) *empty = 1; - if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { - __putname(result); - return ERR_PTR(retval); - } + err = ERR_PTR(-ENOENT); + if (!(flags & LOOKUP_EMPTY)) + goto error; + } + + err = ERR_PTR(-ENAMETOOLONG); + if (likely(len < PATH_MAX)) { + audit_getname(result); + return result; } - audit_getname(result); - return result; + +error: + __putname(result); + return err; } char *getname(const char __user * filename) @@ -228,10 +219,7 @@ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - if (current_user_ns() != inode_userns(inode)) - goto other_perms; - - if (likely(current_fsuid() == inode->i_uid)) + if (likely(uid_eq(current_fsuid(), inode->i_uid))) mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { @@ -244,7 +232,6 @@ static int acl_permission_check(struct inode *inode, int mask) mode >>= 3; } -other_perms: /* * If the DACs are ok we don't need any capability check. */ @@ -280,10 +267,10 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + if (inode_capable(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + if (inode_capable(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -293,7 +280,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. 
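Two mechanical conversions recur in the hunks above: raw `uid_t` comparisons become `uid_eq()` on the boxed `kuid_t` type (locks.c, namei.c), and open-coded `struct qstr` initializers become `QSTR_INIT()` (libfs.c). A small sketch of why both help, with userspace stand-ins for the kernel types:

```c
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

/* Boxing the id in a struct makes kuid_t/uid_t mixups a compile error;
 * uid_eq() becomes the only legal comparison. */
typedef struct { uid_t val; } kuid_t;

static inline bool uid_eq(kuid_t l, kuid_t r)
{
	return l.val == r.val;
}

/* A designated-initializer macro in the spirit of QSTR_INIT(): callers
 * survive field reordering or new fields in struct qstr. */
struct qstr {
	unsigned int len;
	const char *name;
};
#define QSTR_INIT(n, l) { .len = (l), .name = (n) }

int main(void)
{
	static const struct qstr cursor_name = QSTR_INIT(".", 1);
	kuid_t fsuid = { 1000 }, owner = { 1000 };

	printf("%d %s\n", uid_eq(fsuid, owner), cursor_name.name); /* 1 . */
	return 0;
}
```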
*/ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + if (inode_capable(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -301,7 +288,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + if (inode_capable(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -1154,12 +1141,25 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; - dentry = __d_lookup_rcu(parent, name, &seq, inode); + dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); if (!dentry) goto unlazy; - /* Memory barrier in read_seqcount_begin of child is enough */ + /* + * This sequence count validates that the inode matches + * the dentry name information from lookup. + */ + *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, seq)) + return -ECHILD; + + /* + * This sequence count validates that the parent had no + * changes while we did the lookup of the dentry above. + * + * The memory barrier in read_seqcount_begin of child is + * enough, we can use __read_seqcount_retry here. + */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; nd->seq = seq; @@ -1452,7 +1452,8 @@ EXPORT_SYMBOL(full_name_hash); */ static inline unsigned long hash_name(const char *name, unsigned int *hashp) { - unsigned long a, mask, hash, len; + unsigned long a, b, adata, bdata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; hash = a = 0; len = -sizeof(unsigned long); @@ -1460,17 +1461,18 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) hash = (hash + a) * 9; len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); - /* Do we have any NUL or '/' bytes in this word? 
*/ - mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); - } while (!mask); - - /* The mask *below* the first high bit set */ - mask = (mask - 1) & ~mask; - mask >>= 7; - hash += a & mask; + b = a ^ REPEAT_BYTE('/'); + } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); + + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + + mask = create_zero_mask(adata | bdata); + + hash += a & zero_bytemask(mask); *hashp = fold_hash(hash); - return len + count_masked_bytes(mask); + return len + find_zero(mask); } #else @@ -1931,19 +1933,15 @@ static int user_path_parent(int dfd, const char __user *path, */ static inline int check_sticky(struct inode *dir, struct inode *inode) { - uid_t fsuid = current_fsuid(); + kuid_t fsuid = current_fsuid(); if (!(dir->i_mode & S_ISVTX)) return 0; - if (current_user_ns() != inode_userns(inode)) - goto other_userns; - if (inode->i_uid == fsuid) + if (uid_eq(inode->i_uid, fsuid)) return 0; - if (dir->i_uid == fsuid) + if (uid_eq(dir->i_uid, fsuid)) return 0; - -other_userns: - return !ns_capable(inode_userns(inode), CAP_FOWNER); + return !inode_capable(inode, CAP_FOWNER); } /* @@ -2531,8 +2529,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(inode_userns(dir), CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 87484fb..333df07 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -292,7 +292,7 @@ static void ncp_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (S_ISDIR(inode->i_mode)) { DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 2a0e6c5..f90f4f5 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -29,9 +29,20 @@ config NFS_FS If unsure, say N. +config NFS_V2 + bool "NFS client support for NFS version 2" + depends on NFS_FS + default y + help + This option enables support for version 2 of the NFS protocol + (RFC 1094) in the kernel's NFS client. + + If unsure, say Y. + config NFS_V3 bool "NFS client support for NFS version 3" depends on NFS_FS + default y help This option enables support for version 3 of the NFS protocol (RFC 1813) in the kernel's NFS client. 
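The hash_name() rework above generalizes the word-at-a-time trick to both endiannesses via `struct word_at_a_time`. The underlying idea is easiest to see in the little-endian, 64-bit form: detect a NUL or '/' anywhere in an 8-byte word with a few ALU ops, then convert the mask to a byte index. A runnable userspace demo, assuming little-endian and `sizeof(long) == 8`:

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))

/* High bit of each byte becomes set iff that byte of a is zero. */
static unsigned long has_zero(unsigned long a)
{
	return (a - REPEAT_BYTE(0x01)) & ~a & REPEAT_BYTE(0x80);
}

/* Byte index of the first NUL or '/' in one word, or 8 if neither. */
static unsigned first_nul_or_slash(unsigned long a)
{
	unsigned long mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));

	return mask ? (unsigned)__builtin_ctzl(mask) >> 3 : 8;
}

int main(void)
{
	const char buf[] = "ab/cdefg";       /* 8 bytes + NUL */
	unsigned long w;

	memcpy(&w, buf, sizeof(w));          /* load_unaligned_zeropad() stand-in */
	assert(first_nul_or_slash(w) == strcspn(buf, "/"));
	printf("%u\n", first_nul_or_slash(w)); /* 2 */
	return 0;
}
```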
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index b58613d..7ddd45d 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -4,11 +4,12 @@ obj-$(CONFIG_NFS_FS) += nfs.o -nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ - direct.o pagelist.o proc.o read.o symlink.o unlink.o \ +nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ + direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o \ dns_resolve.o cache_lib.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o +nfs-$(CONFIG_NFS_V2) += proc.o nfs2xdr.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 7f6a23f..7ae8a60 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -187,7 +187,6 @@ static void bl_end_io_read(struct bio *bio, int err) struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct nfs_read_data *rdata = (struct nfs_read_data *)par->data; do { struct page *page = bvec->bv_page; @@ -198,9 +197,12 @@ static void bl_end_io_read(struct bio *bio, int err) SetPageUptodate(page); } while (bvec >= bio->bi_io_vec); if (!uptodate) { - if (!rdata->pnfs_error) - rdata->pnfs_error = -EIO; - pnfs_set_lo_fail(rdata->lseg); + struct nfs_read_data *rdata = par->data; + struct nfs_pgio_header *header = rdata->header; + + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); } bio_put(bio); put_parallel(par); @@ -221,7 +223,7 @@ bl_end_par_io_read(void *data, int unused) { struct nfs_read_data *rdata = data; - rdata->task.tk_status = rdata->pnfs_error; + rdata->task.tk_status = rdata->header->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); schedule_work(&rdata->task.u.tk_work); } @@ -229,6 +231,7 @@ bl_end_par_io_read(void *data, int unused) static enum pnfs_try_status bl_read_pagelist(struct nfs_read_data *rdata) { + struct nfs_pgio_header *header = rdata->header; int i, hole; struct bio *bio = NULL; struct pnfs_block_extent *be = NULL, *cow_read = NULL; @@ -239,7 +242,7 @@ bl_read_pagelist(struct nfs_read_data *rdata) int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, - rdata->npages, f_offset, (unsigned int)rdata->args.count); + rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); par = alloc_parallel(rdata); if (!par) @@ -249,17 +252,17 @@ bl_read_pagelist(struct nfs_read_data *rdata) isect = (sector_t) (f_offset >> SECTOR_SHIFT); /* Code assumes extents are page-aligned */ - for (i = pg_index; i < rdata->npages; i++) { + for (i = pg_index; i < rdata->pages.npages; i++) { if (!extent_length) { /* We've used up the previous extent */ bl_put_extent(be); bl_put_extent(cow_read); bio = bl_submit_bio(READ, bio); /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg), + be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); if (!be) { - rdata->pnfs_error = -EIO; + header->pnfs_error = -EIO; goto out; } extent_length = be->be_length - @@ -282,11 +285,12 @@ bl_read_pagelist(struct nfs_read_data *rdata) struct pnfs_block_extent *be_read; be_read = (hole && cow_read) ? 
cow_read : be; - bio = bl_add_page_to_bio(bio, rdata->npages - i, READ, + bio = bl_add_page_to_bio(bio, rdata->pages.npages - i, + READ, isect, pages[i], be_read, bl_end_io_read, par); if (IS_ERR(bio)) { - rdata->pnfs_error = PTR_ERR(bio); + header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } @@ -294,9 +298,9 @@ bl_read_pagelist(struct nfs_read_data *rdata) isect += PAGE_CACHE_SECTORS; extent_length -= PAGE_CACHE_SECTORS; } - if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) { + if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { rdata->res.eof = 1; - rdata->res.count = rdata->inode->i_size - f_offset; + rdata->res.count = header->inode->i_size - f_offset; } else { rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; } @@ -345,7 +349,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err) struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; do { struct page *page = bvec->bv_page; @@ -358,9 +361,12 @@ static void bl_end_io_write_zero(struct bio *bio, int err) } while (bvec >= bio->bi_io_vec); if (unlikely(!uptodate)) { - if (!wdata->pnfs_error) - wdata->pnfs_error = -EIO; - pnfs_set_lo_fail(wdata->lseg); + struct nfs_write_data *data = par->data; + struct nfs_pgio_header *header = data->header; + + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); } bio_put(bio); put_parallel(par); @@ -370,12 +376,13 @@ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + struct nfs_write_data *data = par->data; + struct nfs_pgio_header *header = data->header; if (!uptodate) { - if (!wdata->pnfs_error) - wdata->pnfs_error = -EIO; - pnfs_set_lo_fail(wdata->lseg); + if (!header->pnfs_error) + header->pnfs_error = -EIO; + pnfs_set_lo_fail(header->lseg); } bio_put(bio); put_parallel(par); @@ -391,9 +398,9 @@ static void bl_write_cleanup(struct work_struct *work) dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); wdata = container_of(task, struct nfs_write_data, task); - if (likely(!wdata->pnfs_error)) { + if (likely(!wdata->header->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ - mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), wdata->args.offset, wdata->args.count); } pnfs_ld_write_done(wdata); @@ -404,12 +411,12 @@ static void bl_end_par_io_write(void *data, int num_se) { struct nfs_write_data *wdata = data; - if (unlikely(wdata->pnfs_error)) { - bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval, + if (unlikely(wdata->header->pnfs_error)) { + bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, num_se); } - wdata->task.tk_status = wdata->pnfs_error; + wdata->task.tk_status = wdata->header->pnfs_error; wdata->verf.committed = NFS_FILE_SYNC; INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); schedule_work(&wdata->task.u.tk_work); @@ -540,6 +547,7 @@ check_page: static enum pnfs_try_status bl_write_pagelist(struct nfs_write_data *wdata, int sync) { + struct nfs_pgio_header *header = wdata->header; int i, ret, npg_zero, pg_index, last = 0; struct bio *bio = NULL; struct pnfs_block_extent *be = NULL, *cow_read = NULL; @@ -552,7 +560,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int 
sync) pgoff_t index; u64 temp; int npg_per_block = - NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); /* At this point, wdata->pages is a (sequential) list of nfs_pages. @@ -566,7 +574,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync) /* At this point, have to be more careful with error handling */ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); + be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); if (!be || !is_writable(be, isect)) { dprintk("%s no matching extents!\n", __func__); goto out_mds; @@ -597,10 +605,10 @@ fill_invalid_ext: dprintk("%s zero %dth page: index %lu isect %llu\n", __func__, npg_zero, index, (unsigned long long)isect); - page = bl_find_get_zeroing_page(wdata->inode, index, + page = bl_find_get_zeroing_page(header->inode, index, cow_read); if (unlikely(IS_ERR(page))) { - wdata->pnfs_error = PTR_ERR(page); + header->pnfs_error = PTR_ERR(page); goto out; } else if (page == NULL) goto next_page; @@ -612,7 +620,7 @@ fill_invalid_ext: __func__, ret); end_page_writeback(page); page_cache_release(page); - wdata->pnfs_error = ret; + header->pnfs_error = ret; goto out; } if (likely(!bl_push_one_short_extent(be->be_inval))) @@ -620,11 +628,11 @@ fill_invalid_ext: else { end_page_writeback(page); page_cache_release(page); - wdata->pnfs_error = -ENOMEM; + header->pnfs_error = -ENOMEM; goto out; } /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + mark_extents_written(BLK_LSEG2EXT(header->lseg), page->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE); @@ -632,7 +640,7 @@ fill_invalid_ext: isect, page, be, bl_end_io_write_zero, par); if (IS_ERR(bio)) { - wdata->pnfs_error = PTR_ERR(bio); + header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } @@ -647,16 +655,16 @@ next_page: /* Middle pages */ pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; - for (i = pg_index; i < wdata->npages; i++) { + for (i = pg_index; i < wdata->pages.npages; i++) { if (!extent_length) { /* We've used up the previous extent */ bl_put_extent(be); bio = bl_submit_bio(WRITE, bio); /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), + be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, NULL); if (!be || !is_writable(be, isect)) { - wdata->pnfs_error = -EINVAL; + header->pnfs_error = -EINVAL; goto out; } if (be->be_state == PNFS_BLOCK_INVALID_DATA) { @@ -664,7 +672,7 @@ next_page: be->be_inval))) par->bse_count++; else { - wdata->pnfs_error = -ENOMEM; + header->pnfs_error = -ENOMEM; goto out; } } @@ -677,15 +685,15 @@ next_page: if (unlikely(ret)) { dprintk("%s bl_mark_sectors_init fail %d\n", __func__, ret); - wdata->pnfs_error = ret; + header->pnfs_error = ret; goto out; } } - bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, + bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, isect, pages[i], be, bl_end_io_write, par); if (IS_ERR(bio)) { - wdata->pnfs_error = PTR_ERR(bio); + header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index a5c88a5..c965542 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -123,7 +123,7 @@ nfs4_blk_decode_device(struct nfs_server *server, uint8_t *dataptr; DECLARE_WAITQUEUE(wq, current); int offset, 
len, i, rc; - struct net *net = server->nfs_client->net; + struct net *net = server->nfs_client->cl_net; struct nfs_net *nn = net_generic(net, nfs_net_id); struct bl_dev_msg *reply = &nn->bl_mount_reply; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 60f7e4e..7d10875 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -65,7 +65,7 @@ static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) { int ret = 0; - struct nfs_net *nn = net_generic(clp->net, nfs_net_id); + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->rpc_ops->version != 4 || minorversion != 0) return ret; @@ -90,7 +90,9 @@ static bool nfs4_disable_idmapping = true; * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { +#ifdef CONFIG_NFS_V2 [2] = &nfs_version2, +#endif #ifdef CONFIG_NFS_V3 [3] = &nfs_version3, #endif @@ -129,6 +131,7 @@ const struct rpc_program nfsacl_program = { #endif /* CONFIG_NFS_V3_ACL */ struct nfs_client_initdata { + unsigned long init_flags; const char *hostname; const struct sockaddr *addr; size_t addrlen; @@ -172,7 +175,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_proto = cl_init->proto; - clp->net = get_net(cl_init->net); + clp->cl_net = get_net(cl_init->net); #ifdef CONFIG_NFS_V4 err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); @@ -182,7 +185,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); - clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; @@ -207,6 +209,7 @@ static void nfs4_shutdown_session(struct nfs_client *clp) if (nfs4_has_session(clp)) { nfs4_deviceid_purge_client(clp); nfs4_destroy_session(clp->cl_session); + nfs4_destroy_clientid(clp); } } @@ -235,6 +238,9 @@ static void nfs4_shutdown_client(struct nfs_client *clp) nfs_idmap_delete(clp); rpc_destroy_wait_queue(&clp->cl_rpcwaitq); + kfree(clp->cl_serverowner); + kfree(clp->cl_serverscope); + kfree(clp->cl_implid); } /* idr_remove_all is not needed as all id's are removed by nfs_put_client */ @@ -248,7 +254,7 @@ void nfs_cleanup_cb_ident_idr(struct net *net) /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { - struct nfs_net *nn = net_generic(clp->net, nfs_net_id); + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->cl_cb_ident) idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); @@ -301,10 +307,8 @@ static void nfs_free_client(struct nfs_client *clp) if (clp->cl_machine_cred != NULL) put_rpccred(clp->cl_machine_cred); - put_net(clp->net); + put_net(clp->cl_net); kfree(clp->cl_hostname); - kfree(clp->server_scope); - kfree(clp->impl_id); kfree(clp); dprintk("<-- nfs_free_client()\n"); @@ -321,7 +325,7 @@ void nfs_put_client(struct nfs_client *clp) return; dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); - nn = net_generic(clp->net, nfs_net_id); + nn = net_generic(clp->cl_net, nfs_net_id); if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); @@ -456,6 +460,8 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr, clp->cl_cons_state == NFS_CS_SESSION_INITING)) return false; + smp_rmb(); + /* Match the version 
and minorversion */ if (clp->rpc_ops->version != 4 || clp->cl_minorversion != minorversion) @@ -504,6 +510,47 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat return NULL; } +static bool nfs_client_init_is_complete(const struct nfs_client *clp) +{ + return clp->cl_cons_state != NFS_CS_INITING; +} + +int nfs_wait_client_init_complete(const struct nfs_client *clp) +{ + return wait_event_killable(nfs_client_active_wq, + nfs_client_init_is_complete(clp)); +} + +/* + * Found an existing client. Make sure it's ready before returning. + */ +static struct nfs_client * +nfs_found_client(const struct nfs_client_initdata *cl_init, + struct nfs_client *clp) +{ + int error; + + error = nfs_wait_client_init_complete(clp); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(-ERESTARTSYS); + } + + if (clp->cl_cons_state < NFS_CS_READY) { + error = clp->cl_cons_state; + nfs_put_client(clp); + return ERR_PTR(error); + } + + smp_rmb(); + + BUG_ON(clp->cl_cons_state != NFS_CS_READY); + + dprintk("<-- %s found nfs_client %p for %s\n", + __func__, clp, cl_init->hostname ?: ""); + return clp; +} + /* * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist @@ -512,11 +559,9 @@ static struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, const struct rpc_timeout *timeparms, const char *ip_addr, - rpc_authflavor_t authflavour, - int noresvport) + rpc_authflavor_t authflavour) { struct nfs_client *clp, *new = NULL; - int error; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); dprintk("--> nfs_get_client(%s,v%u)\n", @@ -527,60 +572,29 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, spin_lock(&nn->nfs_client_lock); clp = nfs_match_client(cl_init); - if (clp) - goto found_client; - if (new) - goto install_client; + if (clp) { + spin_unlock(&nn->nfs_client_lock); + if (new) + nfs_free_client(new); + return nfs_found_client(cl_init, clp); + } + if (new) { + list_add(&new->cl_share_link, &nn->nfs_client_list); + spin_unlock(&nn->nfs_client_lock); + new->cl_flags = cl_init->init_flags; + return cl_init->rpc_ops->init_client(new, + timeparms, ip_addr, + authflavour); + } spin_unlock(&nn->nfs_client_lock); new = nfs_alloc_client(cl_init); } while (!IS_ERR(new)); - dprintk("--> nfs_get_client() = %ld [failed]\n", PTR_ERR(new)); + dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", + cl_init->hostname ?: "", PTR_ERR(new)); return new; - - /* install a new client and return with it unready */ -install_client: - clp = new; - list_add(&clp->cl_share_link, &nn->nfs_client_list); - spin_unlock(&nn->nfs_client_lock); - - error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, - authflavour, noresvport); - if (error < 0) { - nfs_put_client(clp); - return ERR_PTR(error); - } - dprintk("--> nfs_get_client() = %p [new]\n", clp); - return clp; - - /* found an existing client - * - make sure it's ready before returning - */ -found_client: - spin_unlock(&nn->nfs_client_lock); - - if (new) - nfs_free_client(new); - - error = wait_event_killable(nfs_client_active_wq, - clp->cl_cons_state < NFS_CS_INITING); - if (error < 0) { - nfs_put_client(clp); - return ERR_PTR(-ERESTARTSYS); - } - - if (clp->cl_cons_state < NFS_CS_READY) { - error = clp->cl_cons_state; - nfs_put_client(clp); - return ERR_PTR(error); - } - - BUG_ON(clp->cl_cons_state != NFS_CS_READY); - - dprintk("--> nfs_get_client() = %p [share]\n", clp); - return clp; } /* @@ -588,27 +602,12 @@ found_client: */ void 
nfs_mark_client_ready(struct nfs_client *clp, int state) { + smp_wmb(); clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } /* - * With sessions, the client is not marked ready until after a - * successful EXCHANGE_ID and CREATE_SESSION. - * - * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate - * other versions of NFS can be tried. - */ -int nfs4_check_client_ready(struct nfs_client *clp) -{ - if (!nfs4_has_session(clp)) - return 0; - if (clp->cl_cons_state < NFS_CS_READY) - return -EPROTONOSUPPORT; - return 0; -} - -/* * Initialise the timeout values for a connection */ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, @@ -654,12 +653,11 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, */ static int nfs_create_rpc_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - rpc_authflavor_t flavor, - int discrtry, int noresvport) + rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { - .net = clp->net, + .net = clp->cl_net, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, @@ -670,9 +668,9 @@ static int nfs_create_rpc_client(struct nfs_client *clp, .authflavor = flavor, }; - if (discrtry) + if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; - if (noresvport) + if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (!IS_ERR(clp->cl_rpcclient)) @@ -713,7 +711,7 @@ static int nfs_start_lockd(struct nfs_server *server) .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, - .net = clp->net, + .net = clp->cl_net, }; if (nlm_init.nfs_version > 3) @@ -805,36 +803,43 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, return 0; } -/* - * Initialise an NFS2 or NFS3 client +/** + * nfs_init_client - Initialise an NFS2 or NFS3 client + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: IP presentation address (not used) + * @authflavor: authentication flavor for underlying RPC transport + * + * Returns pointer to an NFS client, or an ERR_PTR value. 
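Note the barrier pairing: nfs_mark_client_ready() issues smp_wmb() before publishing the state, and nfs_found_client() (earlier hunk) issues smp_rmb() after observing it, so a waiter that sees NFS_CS_READY also sees the fully initialised client. The init_client() conversions that follow also switch from returning an int to returning the client or an ERR_PTR. A sketch of that pointer-encoded errno idiom, with userspace stand-ins and an illustrative "valid" pointer constant:

```c
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Small negative errnos map to the top 4095 addresses, which no valid
 * kernel pointer occupies — so one return value carries both cases. */
static void *ERR_PTR(long error)   { return (void *)error; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct nfs_client_;    /* opaque stand-in */

static struct nfs_client_ *init_client(int fail)
{
	if (fail)
		return ERR_PTR(-EPROTONOSUPPORT);
	return (struct nfs_client_ *)0x1000;   /* pretend-valid pointer */
}

int main(void)
{
	struct nfs_client_ *clp = init_client(1);

	if (IS_ERR(clp))
		printf("error %ld\n", PTR_ERR(clp));   /* error -93 */
	return 0;
}
```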
*/ -int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour, - int noresvport) +struct nfs_client *nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour) { int error; if (clp->cl_cons_state == NFS_CS_READY) { /* the client is already initialised */ dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); - return 0; + return clp; } /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, - 0, noresvport); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); - return 0; + return clp; error: nfs_mark_client_ready(clp, error); + nfs_put_client(clp); dprintk("<-- nfs_init_client() = xerror %d\n", error); - return error; + return ERR_PTR(error); } /* @@ -847,7 +852,7 @@ static int nfs_init_server(struct nfs_server *server, .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, .addrlen = data->nfs_server.addrlen, - .rpc_ops = &nfs_v2_clientops, + .rpc_ops = NULL, .proto = data->nfs_server.protocol, .net = data->net, }; @@ -857,17 +862,28 @@ static int nfs_init_server(struct nfs_server *server, dprintk("--> nfs_init_server()\n"); + switch (data->version) { +#ifdef CONFIG_NFS_V2 + case 2: + cl_init.rpc_ops = &nfs_v2_clientops; + break; +#endif #ifdef CONFIG_NFS_V3 - if (data->version == 3) + case 3: cl_init.rpc_ops = &nfs_v3_clientops; + break; #endif + default: + return -EPROTONOSUPPORT; + } nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); + if (data->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, - data->flags & NFS_MOUNT_NORESVPORT); + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); @@ -880,7 +896,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1048,7 +1064,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve static void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_net *nn = net_generic(clp->net, nfs_net_id); + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); @@ -1065,7 +1081,7 @@ static void nfs_server_remove_lists(struct nfs_server *server) if (clp == NULL) return; - nn = net_generic(clp->net, nfs_net_id); + nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); if (list_empty(&clp->cl_superblocks)) @@ -1333,21 +1349,27 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * so that the client back channel can find the * nfs_client struct */ - clp->cl_cons_state = NFS_CS_SESSION_INITING; + 
nfs_mark_client_ready(clp, NFS_CS_SESSION_INITING); } #endif /* CONFIG_NFS_V4_1 */ return nfs4_init_callback(clp); } -/* - * Initialise an NFS4 client record +/** + * nfs4_init_client - Initialise an NFS4 client record + * + * @clp: nfs_client to initialise + * @timeparms: timeout parameters for underlying RPC transport + * @ip_addr: callback IP address in presentation format + * @authflavor: authentication flavor for underlying RPC transport + * + * Returns pointer to an NFS client, or an ERR_PTR value. */ -int nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour, - int noresvport) +struct nfs_client *nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour) { char buf[INET6_ADDRSTRLEN + 1]; int error; @@ -1355,14 +1377,14 @@ int nfs4_init_client(struct nfs_client *clp, if (clp->cl_cons_state == NFS_CS_READY) { /* the client is initialised already */ dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); - return 0; + return clp; } /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; - error = nfs_create_rpc_client(clp, timeparms, authflavour, - 1, noresvport); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + error = nfs_create_rpc_client(clp, timeparms, authflavour); if (error < 0) goto error; @@ -1395,12 +1417,13 @@ int nfs4_init_client(struct nfs_client *clp, if (!nfs4_has_session(clp)) nfs_mark_client_ready(clp, NFS_CS_READY); - return 0; + return clp; error: nfs_mark_client_ready(clp, error); + nfs_put_client(clp); dprintk("<-- nfs4_init_client() = xerror %d\n", error); - return error; + return ERR_PTR(error); } /* @@ -1429,9 +1452,11 @@ static int nfs4_set_client(struct nfs_server *server, dprintk("--> nfs4_set_client()\n"); + if (server->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, - server->flags & NFS_MOUNT_NORESVPORT); + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -1465,8 +1490,8 @@ error: * the MDS. */ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, - const struct sockaddr *ds_addr, - int ds_addrlen, int ds_proto) + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct nfs_client_initdata cl_init = { .addr = ds_addr, @@ -1474,14 +1499,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, .rpc_ops = &nfs_v4_clientops, .proto = ds_proto, .minorversion = mds_clp->cl_minorversion, - .net = mds_clp->net, - }; - struct rpc_timeout ds_timeout = { - .to_initval = 15 * HZ, - .to_maxval = 15 * HZ, - .to_retries = 1, - .to_exponential = 1, + .net = mds_clp->cl_net, }; + struct rpc_timeout ds_timeout; struct nfs_client *clp; /* @@ -1489,8 +1509,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS * (section 13.1 RFC 5661). 
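The `noresvport` int parameter threaded through every call chain is gone above; callers now set bits in `cl_init.init_flags` (NFS_CS_NORESVPORT, NFS_CS_DISCRTRY) and nfs_create_rpc_client() tests them. A sketch of that calling convention, with illustrative bit numbers:

```c
#include <stdio.h>

enum { CS_NORESVPORT, CS_DISCRTRY };   /* illustrative bit numbers */

struct cl_init {
	unsigned long init_flags;
	/* ...address, rpc_ops, net... */
};

static void set_flag(unsigned long *flags, int bit) { *flags |= 1ul << bit; }
static int test_flag(unsigned long flags, int bit)
{
	return !!(flags & (1ul << bit));
}

/* Consumers test bits instead of growing one int parameter per option. */
static void create_rpc_client(const struct cl_init *ci)
{
	printf("noresvport=%d discrtry=%d\n",
	       test_flag(ci->init_flags, CS_NORESVPORT),
	       test_flag(ci->init_flags, CS_DISCRTRY));
}

int main(void)
{
	struct cl_init ci = { .init_flags = 0 };

	set_flag(&ci.init_flags, CS_NORESVPORT); /* like NFS_MOUNT_NORESVPORT */
	create_rpc_client(&ci);
	return 0;
}
```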
*/ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + mds_clp->cl_rpcclient->cl_auth->au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; @@ -1701,7 +1722,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, rpc_protocol(parent_server->client), parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, - parent_client->net); + parent_client->cl_net); if (error < 0) goto error; @@ -1805,6 +1826,7 @@ void nfs_clients_init(struct net *net) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); + nn->boot_time = CURRENT_TIME; } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 89af1d2..bd3a960 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -316,6 +316,10 @@ out: * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process * + * Note that this function is designed to be called by the state + * manager thread. For this reason, it cannot flush the dirty data, + * since that could deadlock in case of a state recovery error. + * * Returns zero on success, or a negative errno value. */ int nfs_client_return_marked_delegations(struct nfs_client *clp) @@ -340,11 +344,9 @@ restart: server); rcu_read_unlock(); - if (delegation != NULL) { - filemap_flush(inode->i_mapping); + if (delegation != NULL) err = __nfs_inode_return_delegation(inode, delegation, 0); - } iput(inode); if (!err) goto restart; @@ -380,6 +382,10 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) * nfs_inode_return_delegation - synchronously return a delegation * @inode: inode to process * + * This routine will always flush any dirty data to disk on the + * assumption that if we need to return the delegation, then + * we should stop caching. + * * Returns zero on success, or a negative errno value. 
*/ int nfs_inode_return_delegation(struct inode *inode) @@ -389,10 +395,10 @@ int nfs_inode_return_delegation(struct inode *inode) struct nfs_delegation *delegation; int err = 0; + nfs_wb_all(inode); if (rcu_access_pointer(nfsi->delegation) != NULL) { delegation = nfs_detach_delegation(nfsi, server); if (delegation != NULL) { - nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); } } @@ -538,6 +544,8 @@ int nfs_async_inode_return_delegation(struct inode *inode, struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; + filemap_flush(inode->i_mapping); + rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index cd6a7a8..72709c4 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -66,6 +66,7 @@ static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) static inline int nfs_inode_return_delegation(struct inode *inode) { + nfs_wb_all(inode); return 0; } #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8789210..0989a20 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -475,12 +475,32 @@ different: } static +bool nfs_use_readdirplus(struct inode *dir, struct file *filp) +{ + if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) + return false; + if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) + return true; + if (filp->f_pos == 0) + return true; + return false; +} + +/* + * This function is called by the lookup code to request the use of + * readdirplus to accelerate any future lookups in the same + * directory. + */ +static +void nfs_advise_use_readdirplus(struct inode *dir) +{ + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); +} + +static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { - struct qstr filename = { - .len = entry->len, - .name = entry->name, - }; + struct qstr filename = QSTR_INIT(entry->name, entry->len); struct dentry *dentry; struct dentry *alias; struct inode *dir = parent->d_inode; @@ -874,7 +894,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) desc->file = filp; desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = NFS_USE_READDIRPLUS(inode); + desc->plus = nfs_use_readdirplus(inode, filp) ? 
1 : 0; nfs_block_sillyrename(dentry); res = nfs_revalidate_mapping(inode, filp->f_mapping); @@ -1114,7 +1134,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) if (!inode) { if (nfs_neg_need_reval(dir, dentry, nd)) goto out_bad; - goto out_valid; + goto out_valid_noent; } if (is_bad_inode(inode)) { @@ -1143,7 +1163,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) if (fhandle == NULL || fattr == NULL) goto out_error; - error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1156,6 +1176,9 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: + /* Success: notify readdir to use READDIRPLUS */ + nfs_advise_use_readdirplus(dir); + out_valid_noent: dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n", __func__, dentry->d_parent->d_name.name, @@ -1299,7 +1322,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) goto no_entry; if (error < 0) { @@ -1311,6 +1334,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru if (IS_ERR(res)) goto out_unblock_sillyrename; + /* Success: notify readdir to use READDIRPLUS */ + nfs_advise_use_readdirplus(dir); + no_entry: res = d_materialise_unique(dentry, inode); if (res != NULL) { @@ -1646,7 +1672,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 481be7f..23d170b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -56,6 +56,7 @@ #include "internal.h" #include "iostat.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -81,16 +82,19 @@ struct nfs_direct_req { struct completion completion; /* wait for i/o completion */ /* commit state */ - struct list_head rewrite_list; /* saved nfs_write_data structs */ - struct nfs_write_data * commit_data; /* special write_data for commits */ + struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ + struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ + struct work_struct work; int flags; #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ struct nfs_writeverf verf; /* unstable write verifier */ }; +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); -static const struct rpc_call_ops nfs_write_direct_ops; +static void nfs_direct_write_schedule_work(struct work_struct *work); static inline void get_dreq(struct nfs_direct_req *dreq) { @@ -124,22 +128,6 @@ ssize_t 
nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ return -EINVAL; } -static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) -{ - unsigned int npages; - unsigned int i; - - if (count == 0) - return; - pages += (pgbase >> PAGE_SHIFT); - npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; - for (i = 0; i < npages; i++) { - struct page *page = pages[i]; - if (!PageCompound(page)) - set_page_dirty(page); - } -} - static void nfs_direct_release_pages(struct page **pages, unsigned int npages) { unsigned int i; @@ -147,26 +135,30 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) page_cache_release(pages[i]); } +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, + struct nfs_direct_req *dreq) +{ + cinfo->lock = &dreq->lock; + cinfo->mds = &dreq->mds_cinfo; + cinfo->ds = &dreq->ds_cinfo; + cinfo->dreq = dreq; + cinfo->completion_ops = &nfs_direct_commit_completion_ops; +} + static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; - dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); + dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL); if (!dreq) return NULL; kref_init(&dreq->kref); kref_get(&dreq->kref); init_completion(&dreq->completion); - INIT_LIST_HEAD(&dreq->rewrite_list); - dreq->iocb = NULL; - dreq->ctx = NULL; - dreq->l_ctx = NULL; + INIT_LIST_HEAD(&dreq->mds_cinfo.list); + INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); - atomic_set(&dreq->io_count, 0); - dreq->count = 0; - dreq->error = 0; - dreq->flags = 0; return dreq; } @@ -226,47 +218,80 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) nfs_direct_req_release(dreq); } -/* - * We must hold a reference to all the pages in this direct read request - * until the RPCs complete. This could be long *after* we are woken up in - * nfs_direct_wait (for instance, if someone hits ^C on a slow server). 
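Switching nfs_direct_req_alloc() to kmem_cache_zalloc() is what lets all the hand-rolled `dreq->count = 0; dreq->error = 0; ...` lines go: zeroed allocation makes them redundant. The kref_init()/kref_get() pair that remains gives the request two references, one for the caller and one for the completion path. A sketch of that lifetime, using C11 atomics and calloc() as stand-ins:

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct dreq {
	atomic_int refcount;
	/* fields start at zero thanks to zeroed allocation */
};

static struct dreq *dreq_alloc(void)
{
	struct dreq *d = calloc(1, sizeof(*d)); /* kmem_cache_zalloc() */

	if (!d)
		return NULL;
	atomic_init(&d->refcount, 1);      /* kref_init(): caller's ref */
	atomic_fetch_add(&d->refcount, 1); /* kref_get(): completion's ref */
	return d;
}

static void dreq_put(struct dreq *d)
{
	if (atomic_fetch_sub(&d->refcount, 1) == 1) {
		printf("freed\n");
		free(d);
	}
}

int main(void)
{
	struct dreq *d = dreq_alloc();

	if (!d)
		return 1;
	dreq_put(d);   /* completion path drops its reference */
	dreq_put(d);   /* caller drops; object freed here */
	return 0;
}
```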
- */ -static void nfs_direct_read_result(struct rpc_task *task, void *calldata) +static void nfs_direct_readpage_release(struct nfs_page *req) { - struct nfs_read_data *data = calldata; - - nfs_readpage_result(task, data); + dprintk("NFS: direct read done (%s/%lld %d@%lld)\n", + req->wb_context->dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + nfs_release_request(req); } -static void nfs_direct_read_release(void *calldata) +static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) { + unsigned long bytes = 0; + struct nfs_direct_req *dreq = hdr->dreq; - struct nfs_read_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - int status = data->task.tk_status; + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out_put; spin_lock(&dreq->lock); - if (unlikely(status < 0)) { - dreq->error = status; - spin_unlock(&dreq->lock); - } else { - dreq->count += data->res.count; - spin_unlock(&dreq->lock); - nfs_direct_dirty_pages(data->pagevec, - data->args.pgbase, - data->res.count); - } - nfs_direct_release_pages(data->pagevec, data->npages); + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) + dreq->error = hdr->error; + else + dreq->count += hdr->good_bytes; + spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct page *page = req->wb_page; + + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { + if (bytes > hdr->good_bytes) + zero_user(page, 0, PAGE_SIZE); + else if (hdr->good_bytes - bytes < PAGE_SIZE) + zero_user_segment(page, + hdr->good_bytes & ~PAGE_MASK, + PAGE_SIZE); + } + if (!PageCompound(page)) { + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + if (bytes < hdr->good_bytes) + set_page_dirty(page); + } else + set_page_dirty(page); + } + bytes += req->wb_bytes; + nfs_list_remove_request(req); + nfs_direct_readpage_release(req); + } +out_put: if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_free(data); + hdr->release(hdr); +} + +static void nfs_read_sync_pgio_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_release_request(req); + } } -static const struct rpc_call_ops nfs_read_direct_ops = { - .rpc_call_prepare = nfs_read_prepare, - .rpc_call_done = nfs_direct_read_result, - .rpc_release = nfs_direct_read_release, +static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) +{ + get_dreq(hdr->dreq); +} + +static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { + .error_cleanup = nfs_read_sync_pgio_error, + .init_hdr = nfs_direct_pgio_init, + .completion = nfs_direct_read_completion, }; /* @@ -276,107 +301,82 @@ static const struct rpc_call_ops nfs_read_direct_ops = { * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error. 
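The NFS_IOHDR_EOF branch above decides, page by page, how much of a short read to zero: pages wholly past `good_bytes` are cleared, and the page containing the boundary is cleared from `good_bytes & ~PAGE_MASK` (the in-page offset) to the end. A runnable sketch of just that arithmetic, printing the decision instead of touching pages:

```c
#include <stdio.h>

#define PAGE_SIZE_ 4096u

/* 'bytes' walks the request in page-sized steps; 'good' is how many
 * bytes actually came back (cf. hdr->good_bytes). */
static void zero_short_read(unsigned good, unsigned total)
{
	unsigned bytes;

	for (bytes = 0; bytes < total; bytes += PAGE_SIZE_) {
		if (bytes >= good)
			printf("page@%u: zero whole page\n", bytes);
		else if (good - bytes < PAGE_SIZE_)
			printf("page@%u: zero from %u to %u\n",
			       bytes, good % PAGE_SIZE_, PAGE_SIZE_);
		/* else: page fully covered by good data */
	}
}

int main(void)
{
	zero_short_read(5000, 3 * PAGE_SIZE_);
	/* page@0:    fully valid
	 * page@4096: zero from 904 to 4096
	 * page@8192: zero whole page        */
	return 0;
}
```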
*/ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, +static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, const struct iovec *iov, loff_t pos) { + struct nfs_direct_req *dreq = desc->pg_dreq; struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->dentry->d_inode; unsigned long user_addr = (unsigned long)iov->iov_base; size_t count = iov->iov_len; size_t rsize = NFS_SERVER(inode)->rsize; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = &nfs_read_direct_ops, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, - }; unsigned int pgbase; int result; ssize_t started = 0; + struct page **pagevec = NULL; + unsigned int npages; do { - struct nfs_read_data *data; size_t bytes; + int i; pgbase = user_addr & ~PAGE_MASK; - bytes = min(rsize,count); + bytes = min(max_t(size_t, rsize, PAGE_SIZE), count); result = -ENOMEM; - data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); - if (unlikely(!data)) + npages = nfs_page_array_len(pgbase, bytes); + if (!pagevec) + pagevec = kmalloc(npages * sizeof(struct page *), + GFP_KERNEL); + if (!pagevec) break; - down_read(¤t->mm->mmap_sem); result = get_user_pages(current, current->mm, user_addr, - data->npages, 1, 0, data->pagevec, NULL); + npages, 1, 0, pagevec, NULL); up_read(¤t->mm->mmap_sem); - if (result < 0) { - nfs_readdata_free(data); + if (result < 0) break; - } - if ((unsigned)result < data->npages) { + if ((unsigned)result < npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { - nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_free(data); + nfs_direct_release_pages(pagevec, result); break; } bytes -= pgbase; - data->npages = result; + npages = result; } - get_dreq(dreq); - - data->req = (struct nfs_page *) dreq; - data->inode = inode; - data->cred = msg.rpc_cred; - data->args.fh = NFS_FH(inode); - data->args.context = ctx; - data->args.lock_context = dreq->l_ctx; - data->args.offset = pos; - data->args.pgbase = pgbase; - data->args.pages = data->pagevec; - data->args.count = bytes; - data->res.fattr = &data->fattr; - data->res.eof = 0; - data->res.count = bytes; - nfs_fattr_init(&data->fattr); - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - NFS_PROTO(inode)->read_setup(data, &msg); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - break; - rpc_put_task(task); - - dprintk("NFS: %5u initiated direct read call " - "(req %s/%Ld, %zu bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - bytes, - (unsigned long long)data->args.offset); - - started += bytes; - user_addr += bytes; - pos += bytes; - /* FIXME: Remove this unnecessary math from final patch */ - pgbase += bytes; - pgbase &= ~PAGE_MASK; - BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); - - count -= bytes; - } while (count != 0); + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + /* XXX do we need to do the eof zeroing found in async_filler? 
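The pinning loop below sizes its page vector with nfs_page_array_len(): a transfer of `len` bytes that starts `base` bytes into its first page can straddle one extra page. The arithmetic as a runnable check (PAGE_SIZE fixed at 4096 for the demo):

```c
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE_  4096ul
#define PAGE_SHIFT_ 12

static unsigned long nfs_page_array_len(unsigned long base, unsigned long len)
{
	/* round (base + len) up to whole pages */
	return (base + len + PAGE_SIZE_ - 1) >> PAGE_SHIFT_;
}

int main(void)
{
	assert(nfs_page_array_len(0, 4096) == 1);
	assert(nfs_page_array_len(1, 4096) == 2);  /* spills into a 2nd page */
	assert(nfs_page_array_len(100, 1) == 1);
	printf("%lu\n", nfs_page_array_len(4095, 2)); /* 2 */
	return 0;
}
```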
*/ + req = nfs_create_request(dreq->ctx, dreq->inode, + pagevec[i], + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(desc, req)) { + result = desc->pg_error; + nfs_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + started += req_len; + user_addr += req_len; + pos += req_len; + count -= req_len; + } + /* The nfs_page now hold references to these pages */ + nfs_direct_release_pages(pagevec, npages); + } while (count != 0 && result >= 0); + + kfree(pagevec); if (started) return started; @@ -388,15 +388,19 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, unsigned long nr_segs, loff_t pos) { + struct nfs_pageio_descriptor desc; ssize_t result = -EINVAL; size_t requested_bytes = 0; unsigned long seg; + nfs_pageio_init_read(&desc, dreq->inode, + &nfs_direct_read_completion_ops); get_dreq(dreq); + desc.pg_dreq = dreq; for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule_segment(dreq, vec, pos); + result = nfs_direct_read_schedule_segment(&desc, vec, pos); if (result < 0) break; requested_bytes += result; @@ -405,6 +409,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } + nfs_pageio_complete(&desc); + /* * If no bytes were started, return the error, and let the * generic layer handle the completion. @@ -441,104 +447,64 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); + NFS_I(inode)->read_io += result; out_release: nfs_direct_req_release(dreq); out: return result; } -static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) -{ - while (!list_empty(&dreq->rewrite_list)) { - struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); - list_del(&data->pages); - nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_free(data); - } -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { - struct inode *inode = dreq->inode; - struct list_head *p; - struct nfs_write_data *data; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = dreq->ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = &nfs_write_direct_ops, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, - }; + struct nfs_pageio_descriptor desc; + struct nfs_page *req, *tmp; + LIST_HEAD(reqs); + struct nfs_commit_info cinfo; + LIST_HEAD(failed); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); + spin_lock(cinfo.lock); + nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0); + spin_unlock(cinfo.lock); dreq->count = 0; get_dreq(dreq); - list_for_each(p, &dreq->rewrite_list) { - data = list_entry(p, struct nfs_write_data, pages); - - get_dreq(dreq); - - /* Use stable writes */ - data->args.stable = NFS_FILE_SYNC; - - /* - * Reset data->res. - */ - nfs_fattr_init(&data->fattr); - data->res.count = data->args.count; - memset(&data->verf, 0, sizeof(data->verf)); - - /* - * Reuse data->task; data->args should not have changed - * since the original request was sent. 
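[Editorial aside: the request-building loop above slices each iovec into per-page nfs_page requests. The first page contributes PAGE_SIZE - pgbase bytes, since the user buffer may start mid-page; every later page contributes up to PAGE_SIZE, capped by what remains of the segment. A sketch of just that arithmetic, with a made-up helper name; with 4096-byte pages, pgbase = 768 and count = 9000 it yields three slices of 3328, 4096 and 1576 bytes.]

#include <linux/kernel.h>	/* min_t() */

static unsigned int sketch_count_slices(size_t count, unsigned int pgbase)
{
	unsigned int n = 0;

	while (count != 0) {
		/* same req_len computation as the loop above */
		size_t req_len = min_t(size_t, count, PAGE_SIZE - pgbase);

		count -= req_len;
		pgbase = 0;	/* pages after the first start at offset 0 */
		n++;
	}
	return n;
}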
- */ - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - NFS_PROTO(inode)->write_setup(data, &msg); - - /* - * We're called via an RPC callback, so BKL is already held. - */ - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); - - dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, + &nfs_direct_write_completion_ops); + desc.pg_dreq = dreq; + + list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + if (!nfs_pageio_add_request(&desc, req)) { + nfs_list_add_request(req, &failed); + spin_lock(cinfo.lock); + dreq->flags = 0; + dreq->error = -EIO; + spin_unlock(cinfo.lock); + } } + nfs_pageio_complete(&desc); - if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, inode); -} - -static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; + while (!list_empty(&failed)) + nfs_unlock_and_release_request(req); - /* Call the NFS version-specific code */ - NFS_PROTO(data->inode)->commit_done(task, data); + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, dreq->inode); } -static void nfs_direct_commit_release(void *calldata) +static void nfs_direct_commit_complete(struct nfs_commit_data *data) { - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + struct nfs_direct_req *dreq = data->dreq; + struct nfs_commit_info cinfo; + struct nfs_page *req; int status = data->task.tk_status; + nfs_init_cinfo_from_dreq(&cinfo, dreq); if (status < 0) { dprintk("NFS: %5u commit failed with error %d.\n", - data->task.tk_pid, status); + data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); @@ -546,62 +512,47 @@ static void nfs_direct_commit_release(void *calldata) } dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); - nfs_direct_write_complete(dreq, data->inode); - nfs_commit_free(data); + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { + /* Note the rewrite will go through mds */ + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, NULL, &cinfo); + } + nfs_unlock_and_release_request(req); + } + + if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + nfs_direct_write_complete(dreq, data->inode); +} + +static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +{ + /* There is no lock to clear */ } -static const struct rpc_call_ops nfs_commit_direct_ops = { - .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_direct_commit_result, - .rpc_release = nfs_direct_commit_release, +static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { + .completion = nfs_direct_commit_complete, + .error_cleanup = nfs_direct_error_cleanup, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { - struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = dreq->ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - 
.rpc_client = NFS_CLIENT(dreq->inode), - .rpc_message = &msg, - .callback_ops = &nfs_commit_direct_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, - }; - - data->inode = dreq->inode; - data->cred = msg.rpc_cred; - - data->args.fh = NFS_FH(data->inode); - data->args.offset = 0; - data->args.count = 0; - data->args.context = dreq->ctx; - data->args.lock_context = dreq->l_ctx; - data->res.count = 0; - data->res.fattr = &data->fattr; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); - - NFS_PROTO(data->inode)->commit_setup(data, &msg); - - /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ - dreq->commit_data = NULL; - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + int res; + struct nfs_commit_info cinfo; + LIST_HEAD(mds_list); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_scan_commit(dreq->inode, &mds_list, &cinfo); + res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); + if (res < 0) /* res == -ENOMEM */ + nfs_direct_write_reschedule(dreq); } -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +static void nfs_direct_write_schedule_work(struct work_struct *work) { + struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); int flags = dreq->flags; dreq->flags = 0; @@ -613,89 +564,32 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode nfs_direct_write_reschedule(dreq); break; default: - if (dreq->commit_data != NULL) - nfs_commit_free(dreq->commit_data); - nfs_direct_free_writedata(dreq); - nfs_zap_mapping(inode, inode->i_mapping); + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); nfs_direct_complete(dreq); } } -static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - dreq->commit_data = nfs_commitdata_alloc(); - if (dreq->commit_data != NULL) - dreq->commit_data->req = (struct nfs_page *) dreq; + schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } + #else -static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) +static void nfs_direct_write_schedule_work(struct work_struct *work) { - dreq->commit_data = NULL; } static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - nfs_direct_free_writedata(dreq); nfs_zap_mapping(inode, inode->i_mapping); nfs_direct_complete(dreq); } #endif -static void nfs_direct_write_result(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - - nfs_writeback_done(task, data); -} - /* * NB: Return the value of the first error return code. Subsequent * errors after the first one are ignored. 
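[Editorial aside on why the memcmp() of write verifiers in nfs_direct_commit_complete() above forces a resend: an NFSv3/v4 server hands back an opaque write verifier that changes whenever it may have dropped uncommitted data, typically across a reboot. If the verifier in the COMMIT reply no longer matches the one recorded when the UNSTABLE writes went out, those writes may be gone from the server's cache, so the request degrades to NFS_ODIRECT_RESCHED_WRITES and the data is sent again (through the MDS, per the comment above). A sketch of the rule, assuming both sides are plain struct nfs_writeverf.]

#include <linux/string.h>	/* memcmp() */

static bool sketch_commit_data_is_safe(const struct nfs_writeverf *sent,
				       const struct nfs_writeverf *committed)
{
	/* equal verifiers: the server kept our unstable data */
	return memcmp(sent, committed, sizeof(*sent)) == 0;
}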
*/ -static void nfs_direct_write_release(void *calldata) -{ - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - int status = data->task.tk_status; - - spin_lock(&dreq->lock); - - if (unlikely(status < 0)) { - /* An error has occurred, so we should not commit */ - dreq->flags = 0; - dreq->error = status; - } - if (unlikely(dreq->error != 0)) - goto out_unlock; - - dreq->count += data->res.count; - - if (data->res.verf->committed != NFS_FILE_SYNC) { - switch (dreq->flags) { - case 0: - memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); - dreq->flags = NFS_ODIRECT_DO_COMMIT; - break; - case NFS_ODIRECT_DO_COMMIT: - if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { - dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } - } - } -out_unlock: - spin_unlock(&dreq->lock); - - if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, data->inode); -} - -static const struct rpc_call_ops nfs_write_direct_ops = { - .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_direct_write_result, - .rpc_release = nfs_direct_write_release, -}; - /* * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE * operation. If nfs_writedata_alloc() or get_user_pages() fails, @@ -703,132 +597,187 @@ static const struct rpc_call_ops nfs_write_direct_ops = { * handled automatically by nfs_direct_write_result(). Otherwise, if * no requests have been sent, just return an error. */ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, +static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, const struct iovec *iov, - loff_t pos, int sync) + loff_t pos) { + struct nfs_direct_req *dreq = desc->pg_dreq; struct nfs_open_context *ctx = dreq->ctx; struct inode *inode = ctx->dentry->d_inode; unsigned long user_addr = (unsigned long)iov->iov_base; size_t count = iov->iov_len; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_cred = ctx->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = &nfs_write_direct_ops, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, - }; size_t wsize = NFS_SERVER(inode)->wsize; unsigned int pgbase; int result; ssize_t started = 0; + struct page **pagevec = NULL; + unsigned int npages; do { - struct nfs_write_data *data; size_t bytes; + int i; pgbase = user_addr & ~PAGE_MASK; - bytes = min(wsize,count); + bytes = min(max_t(size_t, wsize, PAGE_SIZE), count); result = -ENOMEM; - data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); - if (unlikely(!data)) + npages = nfs_page_array_len(pgbase, bytes); + if (!pagevec) + pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL); + if (!pagevec) break; down_read(¤t->mm->mmap_sem); result = get_user_pages(current, current->mm, user_addr, - data->npages, 0, 0, data->pagevec, NULL); + npages, 0, 0, pagevec, NULL); up_read(¤t->mm->mmap_sem); - if (result < 0) { - nfs_writedata_free(data); + if (result < 0) break; - } - if ((unsigned)result < data->npages) { + + if ((unsigned)result < npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { - nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_free(data); + nfs_direct_release_pages(pagevec, result); break; } bytes -= pgbase; - data->npages = result; + npages = result; } - get_dreq(dreq); - - list_move_tail(&data->pages, &dreq->rewrite_list); - - data->req = (struct nfs_page *) dreq; - 
data->inode = inode; - data->cred = msg.rpc_cred; - data->args.fh = NFS_FH(inode); - data->args.context = ctx; - data->args.lock_context = dreq->l_ctx; - data->args.offset = pos; - data->args.pgbase = pgbase; - data->args.pages = data->pagevec; - data->args.count = bytes; - data->args.stable = sync; - data->res.fattr = &data->fattr; - data->res.count = bytes; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); - - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - NFS_PROTO(inode)->write_setup(data, &msg); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - break; - rpc_put_task(task); - - dprintk("NFS: %5u initiated direct write call " - "(req %s/%Ld, %zu bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - bytes, - (unsigned long long)data->args.offset); + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - started += bytes; - user_addr += bytes; - pos += bytes; - - /* FIXME: Remove this useless math from the final patch */ - pgbase += bytes; - pgbase &= ~PAGE_MASK; - BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); + req = nfs_create_request(dreq->ctx, dreq->inode, + pagevec[i], + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + nfs_lock_request(req); + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(desc, req)) { + result = desc->pg_error; + nfs_unlock_and_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + started += req_len; + user_addr += req_len; + pos += req_len; + count -= req_len; + } + /* The nfs_page now hold references to these pages */ + nfs_direct_release_pages(pagevec, npages); + } while (count != 0 && result >= 0); - count -= bytes; - } while (count != 0); + kfree(pagevec); if (started) return started; return result < 0 ? 
(ssize_t) result : -EFAULT; } +static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_commit_info cinfo; + int bit = -1; + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out_put; + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + + spin_lock(&dreq->lock); + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + dreq->flags = 0; + dreq->error = hdr->error; + } + if (dreq->error != 0) + bit = NFS_IOHDR_ERROR; + else { + dreq->count += hdr->good_bytes; + if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + bit = NFS_IOHDR_NEED_RESCHED; + } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) + bit = NFS_IOHDR_NEED_RESCHED; + else if (dreq->flags == 0) { + memcpy(&dreq->verf, &req->wb_verf, + sizeof(dreq->verf)); + bit = NFS_IOHDR_NEED_COMMIT; + dreq->flags = NFS_ODIRECT_DO_COMMIT; + } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { + if (memcmp(&dreq->verf, &req->wb_verf, sizeof(dreq->verf))) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + bit = NFS_IOHDR_NEED_RESCHED; + } else + bit = NFS_IOHDR_NEED_COMMIT; + } + } + } + spin_unlock(&dreq->lock); + + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + switch (bit) { + case NFS_IOHDR_NEED_RESCHED: + case NFS_IOHDR_NEED_COMMIT: + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, hdr->lseg, &cinfo); + } + nfs_unlock_and_release_request(req); + } + +out_put: + if (put_dreq(dreq)) + nfs_direct_write_complete(dreq, hdr->inode); + hdr->release(hdr); +} + +static void nfs_write_sync_pgio_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_unlock_and_release_request(req); + } +} + +static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { + .error_cleanup = nfs_write_sync_pgio_error, + .init_hdr = nfs_direct_pgio_init, + .completion = nfs_direct_write_completion, +}; + static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, const struct iovec *iov, unsigned long nr_segs, - loff_t pos, int sync) + loff_t pos) { + struct nfs_pageio_descriptor desc; ssize_t result = 0; size_t requested_bytes = 0; unsigned long seg; + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_COND_STABLE, + &nfs_direct_write_completion_ops); + desc.pg_dreq = dreq; get_dreq(dreq); for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule_segment(dreq, vec, - pos, sync); + result = nfs_direct_write_schedule_segment(&desc, vec, pos); if (result < 0) break; requested_bytes += result; @@ -836,6 +785,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; pos += vec->iov_len; } + nfs_pageio_complete(&desc); + NFS_I(dreq->inode)->write_io += desc.pg_bytes_written; /* * If no bytes were started, return the error, and let the @@ -858,16 +809,10 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; - size_t wsize = NFS_SERVER(inode)->wsize; - int sync = NFS_UNSTABLE; dreq = nfs_direct_req_alloc(); if (!dreq) goto out; - nfs_alloc_commit_data(dreq); - - if (dreq->commit_data == NULL || count <= wsize) - sync = NFS_FILE_SYNC; dreq->inode = 
inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); @@ -877,7 +822,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); out_release: @@ -997,10 +942,15 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, task_io_account_write(count); retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + if (retval > 0) { + struct inode *inode = mapping->host; - if (retval > 0) iocb->ki_pos = pos + retval; - + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + } out: return retval; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index aa9b709..56311ca 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -174,6 +174,13 @@ nfs_file_flush(struct file *file, fl_owner_t id) if ((file->f_mode & FMODE_WRITE) == 0) return 0; + /* + * If we're holding a write delegation, then just start the i/o + * but don't wait for completion (or send a commit). + */ + if (nfs_have_delegation(inode, FMODE_WRITE)) + return filemap_fdatawrite(file->f_mapping); + /* Flush writes to the server and return any errors */ return vfs_fsync(file, 0); } @@ -417,6 +424,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, if (status < 0) return status; + NFS_I(mapping->host)->write_io += copied; return copied; } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ae65c16..c817787 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -64,23 +64,12 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp) * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent * superblock across an automount point of some nature. 
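[Editorial note on the i_size update just added to nfs_file_direct_write(): i_size_write() is not safe to call concurrently — on 32-bit SMP kernels loff_t is 64 bits wide and the helper relies on a seqcount — so the writer must supply its own serialization, and NFS uses inode->i_lock for that. The comparison also means the size only ever grows here: a direct write landing inside the file must not shrink it. The pattern in isolation; the sketch_ name is hypothetical.]

#include <linux/fs.h>

static void sketch_grow_isize(struct inode *inode, loff_t new_end)
{
	spin_lock(&inode->i_lock);
	if (i_size_read(inode) < new_end)
		i_size_write(inode, new_end);	/* extend, never truncate */
	spin_unlock(&inode->i_lock);
}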
*/ -void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, - struct nfs_clone_mount *mntdata) +void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) { struct nfs_fscache_key *key, *xkey; struct nfs_server *nfss = NFS_SB(sb); struct rb_node **p, *parent; - int diff, ulen; - - if (uniq) { - ulen = strlen(uniq); - } else if (mntdata) { - struct nfs_server *mnt_s = NFS_SB(mntdata->sb); - if (mnt_s->fscache_key) { - uniq = mnt_s->fscache_key->key.uniquifier; - ulen = mnt_s->fscache_key->key.uniq_len; - } - } + int diff; if (!uniq) { uniq = ""; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index b9c572d..c5b11b5 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -73,9 +73,7 @@ extern void nfs_fscache_unregister(void); extern void nfs_fscache_get_client_cookie(struct nfs_client *); extern void nfs_fscache_release_client_cookie(struct nfs_client *); -extern void nfs_fscache_get_super_cookie(struct super_block *, - const char *, - struct nfs_clone_mount *); +extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); extern void nfs_fscache_init_inode_cookie(struct inode *); @@ -172,12 +170,6 @@ static inline void nfs_fscache_unregister(void) {} static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} -static inline void nfs_fscache_get_super_cookie( - struct super_block *sb, - const char *uniq, - struct nfs_clone_mount *mntdata) -{ -} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {} diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 4ca6f5c..8abfb19 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -150,7 +150,7 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh) goto out; /* Start by getting the root filehandle from the server */ - ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo); if (ret < 0) { dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); goto out; @@ -178,87 +178,4 @@ out: return ret; } -/* - * get an NFS4 root dentry from the root filehandle - */ -struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, - const char *devname) -{ - struct nfs_server *server = NFS_SB(sb); - struct nfs_fattr *fattr = NULL; - struct dentry *ret; - struct inode *inode; - void *name = kstrdup(devname, GFP_KERNEL); - int error; - - dprintk("--> nfs4_get_root()\n"); - - if (!name) - return ERR_PTR(-ENOMEM); - - /* get the info about the server and filesystem */ - error = nfs4_server_capabilities(server, mntfh); - if (error < 0) { - dprintk("nfs_get_root: getcaps error = %d\n", - -error); - kfree(name); - return ERR_PTR(error); - } - - fattr = nfs_alloc_fattr(); - if (fattr == NULL) { - kfree(name); - return ERR_PTR(-ENOMEM); - } - - /* get the actual root for this mount */ - error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); - if (error < 0) { - dprintk("nfs_get_root: getattr error = %d\n", -error); - ret = ERR_PTR(error); - goto out; - } - - if (fattr->valid & NFS_ATTR_FATTR_FSID && - !nfs_fsid_equal(&server->fsid, &fattr->fsid)) - memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); - - inode = nfs_fhget(sb, mntfh, fattr); - if (IS_ERR(inode)) { - dprintk("nfs_get_root: get root inode failed\n"); - ret = 
ERR_CAST(inode); - goto out; - } - - error = nfs_superblock_set_dummy_root(sb, inode); - if (error != 0) { - ret = ERR_PTR(error); - goto out; - } - - /* root dentries normally start off anonymous and get spliced in later - * if the dentry tree reaches them; however if the dentry already - * exists, we'll pick it up at this point and use it as the root - */ - ret = d_obtain_alias(inode); - if (IS_ERR(ret)) { - dprintk("nfs_get_root: get root dentry failed\n"); - goto out; - } - - security_d_instantiate(ret, inode); - spin_lock(&ret->d_lock); - if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { - ret->d_fsdata = name; - name = NULL; - } - spin_unlock(&ret->d_lock); -out: - if (name) - kfree(name); - nfs_free_fattr(fattr); - dprintk("<-- nfs4_get_root()\n"); - return ret; -} - #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index ba3019f..b5b86a0 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -415,7 +415,7 @@ static int __nfs_idmap_register(struct dentry *dir, static void nfs_idmap_unregister(struct nfs_client *clp, struct rpc_pipe *pipe) { - struct net *net = clp->net; + struct net *net = clp->cl_net; struct super_block *pipefs_sb; pipefs_sb = rpc_get_sb_net(net); @@ -429,7 +429,7 @@ static int nfs_idmap_register(struct nfs_client *clp, struct idmap *idmap, struct rpc_pipe *pipe) { - struct net *net = clp->net; + struct net *net = clp->cl_net; struct super_block *pipefs_sb; int err = 0; @@ -530,9 +530,25 @@ static struct nfs_client *nfs_get_client_for_event(struct net *net, int event) struct nfs_net *nn = net_generic(net, nfs_net_id); struct dentry *cl_dentry; struct nfs_client *clp; + int err; +restart: spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { + /* Wait for initialisation to finish */ + if (clp->cl_cons_state == NFS_CS_INITING) { + atomic_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + err = nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + if (err) + return NULL; + goto restart; + } + /* Skip nfs_clients that failed to initialise */ + if (clp->cl_cons_state < 0) + continue; + smp_rmb(); if (clp->rpc_ops != &nfs_v4_clientops) continue; cl_dentry = clp->cl_idmap->idmap_pipe->dentry; @@ -640,20 +656,16 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; struct key *key = cons->key; - int ret; + int ret = -ENOMEM; /* msg and im are freed in idmap_pipe_destroy_msg */ msg = kmalloc(sizeof(*msg), GFP_KERNEL); - if (IS_ERR(msg)) { - ret = PTR_ERR(msg); + if (!msg) goto out0; - } im = kmalloc(sizeof(*im), GFP_KERNEL); - if (IS_ERR(im)) { - ret = PTR_ERR(im); + if (!im) goto out1; - } ret = nfs_idmap_prepare_message(key->description, im, msg); if (ret < 0) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e8bbfa5..2f6f78c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -121,7 +121,7 @@ static void nfs_clear_inode(struct inode *inode) void nfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); nfs_clear_inode(inode); } @@ -285,9 +285,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; /* Why so? 
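[Editorial aside on the idmap hunk above, which fixes a classic mix-up: kmalloc() reports failure by returning NULL, never an ERR_PTR() value, and IS_ERR(NULL) is false, so the old "if (IS_ERR(msg))" tests let allocation failures sail through to a NULL dereference later. The corrected shape in isolation, reusing the file's struct idmap_msg.]

#include <linux/slab.h>
#include <linux/err.h>

static struct idmap_msg *sketch_alloc_im(void)
{
	struct idmap_msg *im = kmalloc(sizeof(*im), GFP_KERNEL);

	/* correct test: kmalloc() yields NULL, so IS_ERR(im) is never true */
	if (!im)
		return NULL;
	return im;
}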
Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -300,8 +298,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; inode->i_data.a_ops = &nfs_dir_aops; - if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) - set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { @@ -327,6 +323,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_gid = -2; inode->i_blocks = 0; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + nfsi->write_io = 0; + nfsi->read_io = 0; nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; @@ -337,24 +335,19 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); @@ -363,15 +356,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -654,6 +643,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); + ctx->mdsthreshold = NULL; return ctx; } @@ -682,6 +672,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) put_rpccred(ctx->cred); dput(ctx->dentry); nfs_sb_deactive(sb); + kfree(ctx->mdsthreshold); kfree(ctx); } @@ -870,6 +861,15 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } +static bool nfs_mapping_need_revalidate_inode(struct inode *inode) +{ + if (nfs_have_delegated_attributes(inode)) + return false; + return 
(NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_timeout(inode) + || NFS_STALE(inode); +} + /** * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode @@ -880,9 +880,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; - if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_cache_expired(inode) - || NFS_STALE(inode)) { + if (nfs_mapping_need_revalidate_inode(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) goto out; @@ -948,6 +946,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat unsigned long invalid = 0; + if (nfs_have_delegated_attributes(inode)) + return 0; /* Has the inode gone and changed behind our back? */ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) return -EIO; @@ -960,7 +960,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); @@ -1279,14 +1279,26 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) - goto out_fileid; + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + inode->i_sb->s_id, (long long)nfsi->fileid, + (long long)fattr->fileid); + goto out_err; + } /* * Make sure the inode's type hasn't changed. */ - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) - goto out_changed; + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + /* + * Big trouble! The inode has become a different object. + */ + printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", + __func__, inode->i_ino, inode->i_mode, fattr->mode); + goto out_err; + } server = NFS_SERVER(inode); /* Update the fsid? 
*/ @@ -1314,7 +1326,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_PAGECACHE; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; @@ -1323,38 +1339,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { - /* NFSv2/v3: Check if the mtime agrees */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { - dprintk("NFS: mtime change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_CTIME) { - /* If ctime has changed we should definitely clear access+acl caches */ - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - /* and probably clear data for a directory too as utimes can cause - * havoc with our cache. - */ - if (S_ISDIR(inode->i_mode)) { - invalid |= NFS_INO_INVALID_DATA; - nfs_force_lookup_revalidate(inode); - } - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ @@ -1466,12 +1459,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->cache_validity |= invalid; return 0; - out_changed: - /* - * Big trouble! The inode has become a different object. - */ - printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", - __func__, inode->i_ino, inode->i_mode, fattr->mode); out_err: /* * No need to worry about unhashing the dentry, as the @@ -1480,13 +1467,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfs_invalidate_inode(inode); return -ESTALE; - - out_fileid: - printk(KERN_ERR "NFS: server %s error: fileid changed\n" - "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", - NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, - (long long)nfsi->fileid, (long long)fattr->fileid); - goto out_err; } @@ -1500,7 +1480,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! 
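[Editorial aside: the nfs_update_inode() rework above shifts cache invalidation onto the change attribute. Server mtime and ctime are now copied in unconditionally, and it is a change in i_version that marks the attribute, data, access and ACL caches (plus the pagecache) invalid. With NFSv2/v3 now synthesizing change_attr from ctime via the nfs_timespec_to_change_attr() helper added elsewhere in this diff, one comparison covers all protocol versions. The core predicate, as a sketch with a hypothetical name.]

static bool sketch_inode_changed(const struct inode *inode,
				 const struct nfs_fattr *fattr)
{
	/* the (possibly ctime-derived) change attribute is authoritative */
	return inode->i_version != fattr->change_attr;
}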
*/ @@ -1547,7 +1527,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; - atomic_set(&nfsi->commits_outstanding, 0); + atomic_set(&nfsi->commit_info.rpcs_out, 0); #endif } @@ -1559,9 +1539,9 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); - INIT_LIST_HEAD(&nfsi->commit_list); + INIT_LIST_HEAD(&nfsi->commit_info.list); nfsi->npages = 0; - nfsi->ncommit = 0; + nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); init_waitqueue_head(&nfsi->waitqueue); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b777bda..1848a72 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -103,6 +103,7 @@ struct nfs_parsed_mount_data { unsigned int version; unsigned int minorversion; char *fscache_uniq; + bool need_mount; struct { struct sockaddr_storage address; @@ -167,11 +168,13 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); +extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); -extern int nfs4_check_client_ready(struct nfs_client *clp); extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, const struct sockaddr *ds_addr, - int ds_addrlen, int ds_proto); + int ds_addrlen, int ds_proto, + unsigned int ds_timeo, + unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -185,21 +188,11 @@ static inline void nfs_fs_proc_exit(void) } #endif -/* nfs4namespace.c */ -#ifdef CONFIG_NFS_V4 -extern struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry); -#else -static inline -struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry) -{ - return ERR_PTR(-ENOENT); -} -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; +struct nfs_pageio_descriptor; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); @@ -210,9 +203,13 @@ extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); +extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount); +extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, + void (*release)(struct nfs_pgio_header *hdr)); +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); /* nfs2xdr.c */ -extern int nfs_stat_to_errno(enum nfs_stat); extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, struct nfs_entry *, int); @@ -237,14 +234,13 @@ extern const u32 nfs41_maxwrite_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif -extern int nfs4_init_ds_session(struct nfs_client *clp); +extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); -extern int nfs_init_client(struct nfs_client *clp, +extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour, - int noresvport); + const char *ip_addr, rpc_authflavor_t authflavour); /* 
dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, @@ -280,9 +276,10 @@ extern void nfs_sb_deactive(struct super_block *sb); extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen); extern struct vfsmount *nfs_d_automount(struct path *path); -#ifdef CONFIG_NFS_V4 -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); -#endif +struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); +struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, @@ -294,46 +291,73 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif -struct nfs_pageio_descriptor; +struct nfs_pgio_completion_ops; /* read.c */ -extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, - const struct rpc_call_ops *call_ops); +extern struct nfs_read_header *nfs_readhdr_alloc(void); +extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); +extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode, + const struct nfs_pgio_completion_ops *compl_ops); +extern int nfs_initiate_read(struct rpc_clnt *clnt, + struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, int flags); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct list_head *head); - + struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, - struct inode *inode); + struct inode *inode, + const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ +extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags, + const struct nfs_pgio_completion_ops *compl_ops); +extern struct nfs_write_header *nfs_writehdr_alloc(void); +extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct list_head *head); + struct nfs_pgio_header *hdr); extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags); + struct inode *inode, int ioflags, + const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_writedata_release(struct nfs_write_data *wdata); -extern void nfs_commit_free(struct nfs_write_data *p); -extern int nfs_initiate_write(struct nfs_write_data *data, - struct rpc_clnt *clnt, +extern void nfs_commit_free(struct nfs_commit_data *p); +extern int nfs_initiate_write(struct rpc_clnt *clnt, + struct nfs_write_data *data, const struct rpc_call_ops *call_ops, - int how); + int how, int flags); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); -extern int nfs_initiate_commit(struct nfs_write_data *data, - struct rpc_clnt *clnt, +extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct rpc_clnt *clnt, + struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, - int how); -extern void nfs_init_commit(struct nfs_write_data *data, + int how, int flags); +extern void 
nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, - struct pnfs_layout_segment *lseg); + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max); +int nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo); +void nfs_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, + int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, - struct pnfs_layout_segment *lseg); -void nfs_commit_clear_lock(struct nfs_inode *nfsi); -void nfs_commitdata_release(void *data); -void nfs_commit_release_pages(struct nfs_write_data *data); -void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head); -void nfs_request_remove_commit_list(struct nfs_page *req); + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); +void nfs_commitdata_release(struct nfs_commit_data *data); +void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo); +void nfs_request_remove_commit_list(struct nfs_page *req, + struct nfs_commit_info *cinfo); +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, @@ -342,15 +366,16 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +/* direct.c */ +void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, + struct nfs_direct_req *dreq); + /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_read_data *); -extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); -extern int nfs4_init_client(struct nfs_client *clp, +extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr, - rpc_authflavor_t authflavour, - int noresvport); -extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); + rpc_authflavor_t authflavour); extern int _nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -466,3 +491,15 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +/* + * Convert a struct timespec into a 64-bit change attribute + * + * This does approximately the same thing as timespec_to_ns(), + * but for calculation efficiency, we multiply the seconds by + * 1024*1024*1024. 
+ */ +static inline +u64 nfs_timespec_to_change_attr(const struct timespec *ts) +{ + return ((u64)ts->tv_sec << 30) + ts->tv_nsec; +} diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index d51868e..08b9c93 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -26,11 +26,6 @@ static LIST_HEAD(nfs_automount_list); static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); int nfs_mountpoint_expiry_timeout = 500 * HZ; -static struct vfsmount *nfs_do_submount(struct dentry *dentry, - struct nfs_fh *fh, - struct nfs_fattr *fattr, - rpc_authflavor_t authflavor); - /* * nfs_path - reconstruct the path given an arbitrary dentry * @base - used to return pointer to the end of devname part of path @@ -118,64 +113,6 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } -#ifdef CONFIG_NFS_V4 -rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) -{ - struct gss_api_mech *mech; - struct xdr_netobj oid; - int i; - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; - - for (i = 0; i < flavors->num_flavors; i++) { - struct nfs4_secinfo_flavor *flavor; - flavor = &flavors->flavors[i]; - - if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { - pseudoflavor = flavor->flavor; - break; - } else if (flavor->flavor == RPC_AUTH_GSS) { - oid.len = flavor->gss.sec_oid4.len; - oid.data = flavor->gss.sec_oid4.data; - mech = gss_mech_get_by_OID(&oid); - if (!mech) - continue; - pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); - gss_mech_put(mech); - break; - } - } - - return pseudoflavor; -} - -static struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir, - struct qstr *name, - struct nfs_fh *fh, - struct nfs_fattr *fattr) -{ - int err; - - if (NFS_PROTO(dir)->version == 4) - return nfs4_proc_lookup_mountpoint(dir, name, fh, fattr); - - err = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, name, fh, fattr); - if (err) - return ERR_PTR(err); - return rpc_clone_client(NFS_SERVER(dir)->client); -} -#else /* CONFIG_NFS_V4 */ -static inline struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir, - struct qstr *name, - struct nfs_fh *fh, - struct nfs_fattr *fattr) -{ - int err = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, name, fh, fattr); - if (err) - return ERR_PTR(err); - return rpc_clone_client(NFS_SERVER(dir)->client); -} -#endif /* CONFIG_NFS_V4 */ - /* * nfs_d_automount - Handle crossing a mountpoint on the server * @path - The mountpoint @@ -191,10 +128,9 @@ static inline struct rpc_clnt *nfs_lookup_mountpoint(struct inode *dir, struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct dentry *parent; + struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; - struct rpc_clnt *client; dprintk("--> nfs_d_automount()\n"); @@ -210,21 +146,7 @@ struct vfsmount *nfs_d_automount(struct path *path) dprintk("%s: enter\n", __func__); - /* Look it up again to get its attributes */ - parent = dget_parent(path->dentry); - client = nfs_lookup_mountpoint(parent->d_inode, &path->dentry->d_name, fh, fattr); - dput(parent); - if (IS_ERR(client)) { - mnt = ERR_CAST(client); - goto out; - } - - if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) - mnt = nfs_do_refmount(client, path->dentry); - else - mnt = nfs_do_submount(path->dentry, fh, fattr, client->cl_auth->au_flavor); - rpc_shutdown_client(client); - + mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); if (IS_ERR(mnt)) goto out; @@ -297,10 +219,8 @@ static struct vfsmount 
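[Editorial worked example for nfs_timespec_to_change_attr() above: shifting the seconds left by 30 bits multiplies by 2^30 = 1073741824 rather than the 10^9 of an exact nanosecond conversion, trading precision for a cheap shift. Since tv_nsec < 10^9 < 2^30, consecutive seconds can never produce overlapping values, so the result stays monotonic for monotonic timestamps — which is all a change attribute needs. Checking one value, with a hypothetical helper name.]

static inline u64 sketch_change_attr_example(void)
{
	const struct timespec ts = { .tv_sec = 2, .tv_nsec = 7 };

	/* (2ULL << 30) + 7 = 2147483648 + 7 = 2147483655 */
	return ((u64)ts.tv_sec << 30) + ts.tv_nsec;
}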
*nfs_do_clone_mount(struct nfs_server *server, * @authflavor - security flavor to use when performing the mount * */ -static struct vfsmount *nfs_do_submount(struct dentry *dentry, - struct nfs_fh *fh, - struct nfs_fattr *fattr, - rpc_authflavor_t authflavor) +struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, + struct nfs_fattr *fattr, rpc_authflavor_t authflavor) { struct nfs_clone_mount mountdata = { .sb = dentry->d_sb, @@ -333,3 +253,18 @@ out: dprintk("<-- nfs_do_submount() = %p\n", mnt); return mnt; } + +struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + int err; + struct dentry *parent = dget_parent(dentry); + + /* Look it up again to get its attributes */ + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); + dput(parent); + if (err != 0) + return ERR_PTR(err); + + return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor); +} diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index aa14ec3..8a6394e 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -1,3 +1,7 @@ +/* + * NFS-private data for each "struct net". Accessed with net_generic(). + */ + #ifndef __NFS_NETNS_H__ #define __NFS_NETNS_H__ @@ -20,6 +24,7 @@ struct nfs_net { struct idr cb_ident_idr; /* Protected by nfs_client_lock */ #endif spinlock_t nfs_client_lock; + struct timespec boot_time; }; extern int nfs_net_id; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 1f56000..baf759b 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -61,6 +61,7 @@ #define NFS_readdirres_sz (1) #define NFS_statfsres_sz (1+NFS_info_sz) +static int nfs_stat_to_errno(enum nfs_stat); /* * While encoding arguments, set up the reply buffer in advance to @@ -313,6 +314,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); xdr_decode_time(p, &fattr->ctime); + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -1109,7 +1112,7 @@ static const struct { * Returns a local errno value, or -EIO if the NFS status code is * not recognized. This function is used jointly by NFSv2 and NFSv3. 
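[Editorial aside: crossing a server-side mountpoint is now dispatched through the per-version ops table. nfs_d_automount() above calls rpc_ops->submount() instead of open-coding an NFSv4 special case, with nfs_submount() (a fresh lookup followed by nfs_do_submount()) serving v2/v3 and nfs4_submount(), declared further below, free to handle referrals. A sketch of the shape of that indirection, with made-up names standing in for the real nfs_rpc_ops plumbing.]

/* per-version ops entry (sketch; the real table is struct nfs_rpc_ops) */
struct sketch_ops {
	struct vfsmount *(*submount)(struct nfs_server *, struct dentry *,
				     struct nfs_fh *, struct nfs_fattr *);
};

static struct vfsmount *sketch_cross_mountpoint(struct nfs_server *server,
						struct dentry *dentry,
						struct nfs_fh *fh,
						struct nfs_fattr *fattr)
{
	/* version-specific behaviour hides behind one call site */
	return server->nfs_client->rpc_ops->submount(server, dentry,
						     fh, fattr);
}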
*/ -int nfs_stat_to_errno(enum nfs_stat status) +static int nfs_stat_to_errno(enum nfs_stat status) { int i; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 5242eae..2292a0f 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -142,7 +142,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, +nfs3_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs3_diropargs arg = { @@ -398,8 +398,7 @@ nfs3_proc_remove(struct inode *dir, struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name.len = name->len, - .name.name = name->name, + .name = *name, }; struct nfs_removeres res; struct rpc_message msg = { @@ -811,11 +810,13 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) { - if (nfs3_async_handle_jukebox(task, data->inode)) + struct inode *inode = data->header->inode; + + if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - nfs_invalidate_atime(data->inode); - nfs_refresh_inode(data->inode, &data->fattr); + nfs_invalidate_atime(inode); + nfs_refresh_inode(inode, &data->fattr); return 0; } @@ -831,10 +832,12 @@ static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) { - if (nfs3_async_handle_jukebox(task, data->inode)) + struct inode *inode = data->header->inode; + + if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; } @@ -848,7 +851,12 @@ static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_ rpc_call_start(task); } -static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + rpc_call_start(task); +} + +static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) { if (nfs3_async_handle_jukebox(task, data->inode)) return -EAGAIN; @@ -856,7 +864,7 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs3_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } @@ -876,6 +884,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .file_inode_ops = &nfs3_file_inode_operations, .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, + .submount = nfs_submount, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, .lookup = nfs3_proc_lookup, @@ -907,6 +916,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, + .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index a77cc9a..902de48 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -86,6 +86,8 @@ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) +static int nfs3_stat_to_errno(enum 
nfs_stat); + /* * Map file type to S_IFMT bits */ @@ -675,6 +677,7 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) p = xdr_decode_nfstime3(p, &fattr->atime); p = xdr_decode_nfstime3(p, &fattr->mtime); xdr_decode_nfstime3(p, &fattr->ctime); + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); fattr->valid |= NFS_ATTR_FATTR_V3; return 0; @@ -725,12 +728,14 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) goto out_overflow; fattr->valid |= NFS_ATTR_FATTR_PRESIZE + | NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_PRECTIME; p = xdr_decode_size3(p, &fattr->pre_size); p = xdr_decode_nfstime3(p, &fattr->pre_mtime); xdr_decode_nfstime3(p, &fattr->pre_ctime); + fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); return 0; out_overflow: @@ -1287,7 +1292,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, * }; */ static void encode_commit3args(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_commitargs *args) { __be32 *p; @@ -1300,7 +1305,7 @@ static void encode_commit3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_commit3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_commitargs *args) { encode_commit3args(xdr, args); } @@ -1385,7 +1390,7 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, out: return error; out_default: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1424,7 +1429,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1472,7 +1477,7 @@ out_default: error = decode_post_op_attr(xdr, result->dir_attr); if (unlikely(error)) goto out; - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1513,7 +1518,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, out: return error; out_default: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1554,7 +1559,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, out: return error; out_default: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1636,7 +1641,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1706,7 +1711,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1770,7 +1775,7 @@ out_default: error = decode_wcc_data(xdr, result->dir_attr); if (unlikely(error)) goto out; - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1809,7 +1814,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1853,7 +1858,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -1896,7 +1901,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /** @@ -2088,7 +2093,7 @@ out_default: error = 
decode_post_op_attr(xdr, result->dir_attr); if (unlikely(error)) goto out; - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -2156,7 +2161,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -2232,7 +2237,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -2295,7 +2300,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } /* @@ -2319,7 +2324,7 @@ out_status: */ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_commitres *result) { enum nfs_stat status; int error; @@ -2336,7 +2341,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, out: return error; out_status: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } #ifdef CONFIG_NFS_V3_ACL @@ -2401,7 +2406,7 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, out: return error; out_default: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, @@ -2420,11 +2425,76 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, out: return error; out_default: - return nfs_stat_to_errno(status); + return nfs3_stat_to_errno(status); } #endif /* CONFIG_NFS_V3_ACL */ + +/* + * We need to translate between nfs status return values and + * the local errno values which may not be the same. + */ +static const struct { + int stat; + int errno; +} nfs_errtbl[] = { + { NFS_OK, 0 }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, +#ifdef EWFLUSH + { NFSERR_WFLUSH, -EWFLUSH }, +#endif + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -EREMOTEIO }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } +}; + +/** + * nfs3_stat_to_errno - convert an NFS status code to a local errno + * @status: NFS status code to convert + * + * Returns a local errno value, or -EIO if the NFS status code is + * not recognized. This function is now private to the NFSv3 XDR code; NFSv2 keeps its own copy.
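+ *
+ * Editor's illustration (not part of the patch itself): every decoder in
+ * this file now funnels its failure path through the helper, e.g.
+ *
+ *	out_status:
+ *		return nfs3_stat_to_errno(status);
+ *
+ * so a wire-level NFSERR_NOENT becomes -ENOENT locally, while any status
+ * missing from nfs_errtbl[] falls through to the { -1, -EIO } sentinel.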
+ */ +static int nfs3_stat_to_errno(enum nfs_stat status) +{ + int i; + + for (i = 0; nfs_errtbl[i].stat != -1; i++) { + if (nfs_errtbl[i].stat == (int)status) + return nfs_errtbl[i].errno; + } + dprintk("NFS: Unrecognized nfs status value: %u\n", status); + return nfs_errtbl[i].errno; +} + + #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8d75021..c6827f93 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -24,6 +24,8 @@ enum nfs4_client_state { NFS4CLNT_RECALL_SLOT, NFS4CLNT_LEASE_CONFIRM, NFS4CLNT_SERVER_SCOPE_MISMATCH, + NFS4CLNT_PURGE_STATE, + NFS4CLNT_BIND_CONN_TO_SESSION, }; enum nfs4_session_state { @@ -52,11 +54,6 @@ struct nfs4_minor_version_ops { const struct nfs4_state_maintenance_ops *state_renewal_ops; }; -struct nfs_unique_id { - struct rb_node rb_node; - __u64 id; -}; - #define NFS_SEQID_CONFIRMED 1 struct nfs_seqid_counter { ktime_t create_time; @@ -206,12 +203,18 @@ extern const struct dentry_operations nfs4_dentry_operations; extern const struct inode_operations nfs4_dir_inode_operations; /* nfs4namespace.c */ +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); +struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); /* nfs4proc.c */ extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred); extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); +extern int nfs4_destroy_clientid(struct nfs_client *clp); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); @@ -239,8 +242,8 @@ extern int nfs41_setup_sequence(struct nfs4_session *session, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); -extern int nfs4_proc_create_session(struct nfs_client *); -extern int nfs4_proc_destroy_session(struct nfs4_session *); +extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); @@ -310,9 +313,9 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); -extern void nfs4_schedule_session_recovery(struct nfs4_session *); +extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); #else -static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { } #endif /* CONFIG_NFS_V4_1 */ @@ -334,7 +337,7 @@ extern void 
nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs41_handle_server_scope(struct nfs_client *, - struct server_scope **); + struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 5acfd9e..e134029 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -82,29 +82,76 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } +static void filelayout_reset_write(struct nfs_write_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct rpc_task *task = &data->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + data->task.tk_pid, + hdr->inode->i_sb->s_id, + (long long)NFS_FILEID(hdr->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops); + } +} + +static void filelayout_reset_read(struct nfs_read_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct rpc_task *task = &data->task; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + data->task.tk_pid, + hdr->inode->i_sb->s_id, + (long long)NFS_FILEID(hdr->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops); + } +} + static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, - int *reset) + struct pnfs_layout_segment *lseg) { - struct nfs_server *mds_server = NFS_SERVER(state->inode); + struct inode *inode = lseg->pls_layout->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); struct nfs_client *mds_client = mds_server->nfs_client; + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; if (task->tk_status >= 0) return 0; - *reset = 0; switch (task->tk_status) { /* MDS state errors */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: + if (state == NULL) + break; nfs4_schedule_stateid_recovery(mds_server, state); goto wait_on_recovery; case -NFS4ERR_EXPIRED: - nfs4_schedule_stateid_recovery(mds_server, state); + if (state != NULL) + nfs4_schedule_stateid_recovery(mds_server, state); nfs4_schedule_lease_recovery(mds_client); goto wait_on_recovery; /* DS session errors */ @@ -118,7 +165,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, dprintk("%s ERROR %d, Reset session. 
Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); - nfs4_schedule_session_recovery(clp->cl_session); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); break; case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: @@ -127,11 +174,48 @@ static int filelayout_async_handle_error(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. + */ + set_bit(NFS_LAYOUT_INVALID, + &NFS_I(inode)->layout->plh_flags); + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + if (!filelayout_test_devid_invalid(devid)) + _pnfs_return_layout(inode); + filelayout_mark_devid_invalid(devid); + rpc_wake_up(&tbl->slot_tbl_waitq); + nfs4_ds_disconnect(clp); + /* fall through */ default: - dprintk("%s DS error. Retry through MDS %d\n", __func__, +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, task->tk_status); - *reset = 1; - break; + return -NFS4ERR_RESET_TO_MDS; } out: task->tk_status = 0; @@ -148,18 +232,17 @@ wait_on_recovery: static int filelayout_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - int reset = 0; + struct nfs_pgio_header *hdr = data->header; + int err; - dprintk("%s DS read\n", __func__); + err = filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, hdr->lseg); - if (filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, &reset) == -EAGAIN) { - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", - __func__, data->ds_clp, data->ds_clp->cl_session); - if (reset) { - pnfs_set_lo_fail(data->lseg); - nfs4_reset_read(task, data); - } + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_read(data); + return task->tk_status; + case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; } @@ -175,13 +258,15 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_write_data *wdata) { - if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + struct nfs_pgio_header *hdr = wdata->header; + + if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || wdata->res.verf->committed == NFS_FILE_SYNC) return; pnfs_set_layoutcommit(wdata); - dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, - (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); + dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } /* @@ -191,8 +276,14 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct 
nfs_read_data *rdata = data; + if (filelayout_reset_to_mds(rdata->header->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_read(rdata); + rpc_exit(task, 0); + return; + } rdata->read_done_cb = filelayout_read_done_cb; if (nfs41_setup_sequence(rdata->ds_clp->cl_session, @@ -205,42 +296,47 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && + task->tk_status == 0) + return; + /* Note this may cause RPC to be resent */ - rdata->mds_ops->rpc_call_done(task, data); + rdata->header->mds_ops->rpc_call_done(task, data); } static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; - rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics); + rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_read_data *rdata = (struct nfs_read_data *)data; + struct nfs_read_data *rdata = data; - put_lseg(rdata->lseg); - rdata->mds_ops->rpc_release(data); + nfs_put_client(rdata->ds_clp); + rdata->header->mds_ops->rpc_release(data); } static int filelayout_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { - int reset = 0; - - if (filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, &reset) == -EAGAIN) { - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", - __func__, data->ds_clp, data->ds_clp->cl_session); - if (reset) { - pnfs_set_lo_fail(data->lseg); - nfs4_reset_write(task, data); - } + struct nfs_pgio_header *hdr = data->header; + int err; + + err = filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, hdr->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + filelayout_reset_write(data); + return task->tk_status; + case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; } @@ -250,7 +346,7 @@ static int filelayout_write_done_cb(struct rpc_task *task, } /* Fake up some data that will cause nfs_commit_release to retry the writes. 
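 * (Editor's note, inferred from the helper's body elsewhere in this file
 * rather than from this hunk: it zeroes task.tk_status and perturbs the
 * stored write verifier, so the commit completion path sees a verifier
 * mismatch and re-marks every request for another COMMIT.)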
*/ -static void prepare_to_resend_writes(struct nfs_write_data *data) +static void prepare_to_resend_writes(struct nfs_commit_data *data) { struct nfs_page *first = nfs_list_entry(data->pages.next); @@ -261,19 +357,19 @@ static void prepare_to_resend_writes(struct nfs_write_data *data) } static int filelayout_commit_done_cb(struct rpc_task *task, - struct nfs_write_data *data) + struct nfs_commit_data *data) { - int reset = 0; - - if (filelayout_async_handle_error(task, data->args.context->state, - data->ds_clp, &reset) == -EAGAIN) { - dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", - __func__, data->ds_clp, data->ds_clp->cl_session); - if (reset) { - prepare_to_resend_writes(data); - pnfs_set_lo_fail(data->lseg); - } else - rpc_restart_call_prepare(task); + int err; + + err = filelayout_async_handle_error(task, NULL, data->ds_clp, + data->lseg); + + switch (err) { + case -NFS4ERR_RESET_TO_MDS: + prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); return -EAGAIN; } @@ -282,8 +378,14 @@ static int filelayout_commit_done_cb(struct rpc_task *task, static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; + if (filelayout_reset_to_mds(wdata->header->lseg)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + filelayout_reset_write(wdata); + rpc_exit(task, 0); + return; + } if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, task)) @@ -294,36 +396,66 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; + + if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && + task->tk_status == 0) + return; /* Note this may cause RPC to be resent */ - wdata->mds_ops->rpc_call_done(task, data); + wdata->header->mds_ops->rpc_call_done(task, data); } static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; - rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics); + rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_write_data *wdata = (struct nfs_write_data *)data; + struct nfs_write_data *wdata = data; + + nfs_put_client(wdata->ds_clp); + wdata->header->mds_ops->rpc_release(data); +} + +static void filelayout_commit_prepare(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; - put_lseg(wdata->lseg); - wdata->mds_ops->rpc_release(data); + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); } -static void filelayout_commit_release(void *data) +static void filelayout_commit_release(void *calldata) { - struct nfs_write_data 
*wdata = (struct nfs_write_data *)data; + struct nfs_commit_data *data = calldata; - nfs_commit_release_pages(wdata); - if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) - nfs_commit_clear_lock(NFS_I(wdata->inode)); - put_lseg(wdata->lseg); - nfs_commitdata_release(wdata); + data->completion_ops->completion(data); + put_lseg(data->lseg); + nfs_put_client(data->ds_clp); + nfs_commitdata_release(data); } static const struct rpc_call_ops filelayout_read_call_ops = { @@ -341,16 +473,17 @@ static const struct rpc_call_ops filelayout_write_call_ops = { }; static const struct rpc_call_ops filelayout_commit_call_ops = { - .rpc_call_prepare = filelayout_write_prepare, - .rpc_call_done = filelayout_write_call_done, - .rpc_count_stats = filelayout_write_count_stats, + .rpc_call_prepare = filelayout_commit_prepare, + .rpc_call_done = filelayout_write_commit_done, + .rpc_count_stats = filelayout_commit_count_stats, .rpc_release = filelayout_commit_release, }; static enum pnfs_try_status filelayout_read_pagelist(struct nfs_read_data *data) { - struct pnfs_layout_segment *lseg = data->lseg; + struct nfs_pgio_header *hdr = data->header; + struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; loff_t offset = data->args.offset; u32 j, idx; @@ -358,25 +491,20 @@ filelayout_read_pagelist(struct nfs_read_data *data) int status; dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", - __func__, data->inode->i_ino, + __func__, hdr->inode->i_ino, data->args.pgbase, (size_t)data->args.count, offset); - if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) - return PNFS_NOT_ATTEMPTED; - /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); ds = nfs4_fl_prepare_ds(lseg, idx); - if (!ds) { - /* Either layout fh index faulty, or ds connect failed */ - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + if (!ds) return PNFS_NOT_ATTEMPTED; - } - dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr); + dprintk("%s USE DS: %s cl_count %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); /* No multipath support. 
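 * (Editor's note: the atomic_inc() below pins the DS nfs_client for the
 * duration of the RPC; the matching nfs_put_client() is in
 * filelayout_read_release().)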
Use first DS */ + atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) @@ -386,8 +514,8 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, - &filelayout_read_call_ops); + status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, + &filelayout_read_call_ops, RPC_TASK_SOFTCONN); BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -396,32 +524,26 @@ filelayout_read_pagelist(struct nfs_read_data *data) static enum pnfs_try_status filelayout_write_pagelist(struct nfs_write_data *data, int sync) { - struct pnfs_layout_segment *lseg = data->lseg; + struct nfs_pgio_header *hdr = data->header; + struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; loff_t offset = data->args.offset; u32 j, idx; struct nfs_fh *fh; int status; - if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) - return PNFS_NOT_ATTEMPTED; - /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); ds = nfs4_fl_prepare_ds(lseg, idx); - if (!ds) { - printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", - __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + if (!ds) return PNFS_NOT_ATTEMPTED; - } - dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__, - data->inode->i_ino, sync, (size_t) data->args.count, offset, - ds->ds_remotestr); + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); data->write_done_cb = filelayout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) @@ -433,8 +555,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, - &filelayout_write_call_ops, sync); + status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, + &filelayout_write_call_ops, sync, + RPC_TASK_SOFTCONN); BUG_ON(status != 0); return PNFS_ATTEMPTED; } @@ -650,10 +773,65 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) dprintk("--> %s\n", __func__); nfs4_fl_put_deviceid(fl->dsaddr); - kfree(fl->commit_buckets); + /* This assumes a single RW lseg */ + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_filelayout *flo; + + flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); + flo->commit_info.nbuckets = 0; + kfree(flo->commit_info.buckets); + flo->commit_info.buckets = NULL; + } _filelayout_free_lseg(fl); } +static int +filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size; + + if (fl->commit_through_mds) + return 0; + if (cinfo->ds->nbuckets != 0) { + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of <multipath_list4, fh> keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + return 0; + } + + size = (fl->stripe_type == STRIPE_SPARSE) ? 
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + else { + int i; + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets != 0) + kfree(buckets); + else { + cinfo->ds->buckets = buckets; + cinfo->ds->nbuckets = size; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + } + } + spin_unlock(cinfo->lock); + return 0; + } +} + static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, @@ -673,29 +851,6 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, _filelayout_free_lseg(fl); return NULL; } - - /* This assumes there is only one IOMODE_RW lseg. What - * we really want to do is have a layout_hdr level - * dictionary of <multipath_list4, fh> keys, each - * associated with a struct list_head, populated by calls - * to filelayout_write_pagelist(). - * */ - if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) { - int i; - int size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - - fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags); - if (!fl->commit_buckets) { - filelayout_free_lseg(&fl->generic_hdr); - return NULL; - } - fl->number_of_buckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&fl->commit_buckets[i].written); - INIT_LIST_HEAD(&fl->commit_buckets[i].committing); - } - } return &fl->generic_hdr; } @@ -716,8 +871,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, !nfs_generic_pg_test(pgio, prev, req)) return false; - p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; - r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + p_stripe = (u64)req_offset(prev); + r_stripe = (u64)req_offset(req); stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; do_div(p_stripe, stripe_unit); @@ -732,6 +887,16 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, { BUG_ON(pgio->pg_lseg != NULL); + if (req->wb_offset != req->wb_pgbase) { + /* + * Handling unaligned pages is difficult, because have to + * somehow split a req in two in certain cases in the + * pg.test code. Avoid this by just not using pnfs + * in this case. + */ + nfs_pageio_reset_read_mds(pgio); + return; + } pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, @@ -747,8 +912,13 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + struct nfs_commit_info cinfo; + int status; + BUG_ON(pgio->pg_lseg != NULL); + if (req->wb_offset != req->wb_pgbase) + goto out_mds; pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, @@ -757,7 +927,17 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, GFP_NOFS); /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - nfs_pageio_reset_write_mds(pgio); + goto out_mds; + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); + status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) { + put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out_mds; + } + return; +out_mds: + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { @@ -784,43 +964,42 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) * If this will make the bucket empty, it will need to put the lseg reference. 
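 *
 * Editor's sketch, restating the new code below: when the request being
 * cleared is the last one on its bucket's written list, the bucket hands
 * back its lseg roughly like
 *
 *	bucket = list_first_entry(&req->wb_list,
 *				  struct pnfs_commit_bucket, written);
 *	freeme = bucket->wlseg;
 *	bucket->wlseg = NULL;
 *
 * with put_lseg(freeme) deferred until cinfo->lock has been dropped.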
*/ static void -filelayout_clear_request_commit(struct nfs_page *req) +filelayout_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo) { struct pnfs_layout_segment *freeme = NULL; - struct inode *inode = req->wb_context->dentry->d_inode; - spin_lock(&inode->i_lock); + spin_lock(cinfo->lock); if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) goto out; + cinfo->ds->nwritten--; if (list_is_singular(&req->wb_list)) { - struct pnfs_layout_segment *lseg; + struct pnfs_commit_bucket *bucket; - /* From here we can find the bucket, but for the moment, - * since there is only one relevant lseg... - */ - list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { - if (lseg->pls_range.iomode == IOMODE_RW) { - freeme = lseg; - break; - } - } + bucket = list_first_entry(&req->wb_list, + struct pnfs_commit_bucket, + written); + freeme = bucket->wlseg; + bucket->wlseg = NULL; } out: - nfs_request_remove_commit_list(req); - spin_unlock(&inode->i_lock); + nfs_request_remove_commit_list(req, cinfo); + spin_unlock(cinfo->lock); put_lseg(freeme); } static struct list_head * filelayout_choose_commit_list(struct nfs_page *req, - struct pnfs_layout_segment *lseg) + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); u32 i, j; struct list_head *list; + struct pnfs_commit_bucket *buckets; if (fl->commit_through_mds) - return &NFS_I(req->wb_context->dentry->d_inode)->commit_list; + return &cinfo->mds->list; /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive @@ -828,31 +1007,33 @@ filelayout_choose_commit_list(struct nfs_page *req, * to store the value calculated in filelayout_write_pagelist * and just use that here. */ - j = nfs4_fl_calc_j_index(lseg, - (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + j = nfs4_fl_calc_j_index(lseg, req_offset(req)); i = select_bucket_index(fl, j); - list = &fl->commit_buckets[i].written; + buckets = cinfo->ds->buckets; + list = &buckets[i].written; if (list_empty(list)) { /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. 
It could also be released if the last req is pulled * off due to a rewrite, in which case it will be done in - * filelayout_remove_commit_req + * filelayout_clear_request_commit */ - get_lseg(lseg); + buckets[i].wlseg = get_lseg(lseg); } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; return list; } static void filelayout_mark_request_commit(struct nfs_page *req, - struct pnfs_layout_segment *lseg) + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct list_head *list; - list = filelayout_choose_commit_list(req, lseg); - nfs_request_add_commit_list(req, list); + list = filelayout_choose_commit_list(req, lseg, cinfo); + nfs_request_add_commit_list(req, list, cinfo); } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -880,7 +1061,7 @@ select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) return flseg->fh_array[i]; } -static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) { struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; @@ -890,135 +1071,138 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how) idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) { - printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", - __func__); - set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); - set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); prepare_to_resend_writes(data); filelayout_commit_release(data); return -EAGAIN; } - dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); - data->write_done_cb = filelayout_commit_done_cb; + dprintk("%s ino %lu, how %d cl_count %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->commit_done_cb = filelayout_commit_done_cb; + atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; - return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, - &filelayout_commit_call_ops, how); -} - -/* - * This is only useful while we are using whole file layouts. 
- */ -static struct pnfs_layout_segment * -find_only_write_lseg_locked(struct inode *inode) -{ - struct pnfs_layout_segment *lseg; - - list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - return lseg; - return NULL; -} - -static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) -{ - struct pnfs_layout_segment *rv; - - spin_lock(&inode->i_lock); - rv = find_only_write_lseg_locked(inode); - if (rv) - get_lseg(rv); - spin_unlock(&inode->i_lock); - return rv; + return nfs_initiate_commit(ds->ds_clp->cl_rpcclient, data, + &filelayout_commit_call_ops, how, + RPC_TASK_SOFTCONN); } static int -filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max, - spinlock_t *lock) +transfer_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) { - struct list_head *src = &bucket->written; - struct list_head *dst = &bucket->committing; struct nfs_page *req, *tmp; int ret = 0; list_for_each_entry_safe(req, tmp, src, wb_list) { if (!nfs_lock_request(req)) continue; - if (cond_resched_lock(lock)) + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req); + nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; - if (ret == max) + if ((ret == max) && !cinfo->dreq) break; } return ret; } +static int +filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) +{ + struct list_head *src = &bucket->written; + struct list_head *dst = &bucket->committing; + int ret; + + ret = transfer_commit_list(src, dst, cinfo, max); + if (ret) { + cinfo->ds->nwritten -= ret; + cinfo->ds->ncommitting += ret; + bucket->clseg = bucket->wlseg; + if (list_empty(src)) + bucket->wlseg = NULL; + else + get_lseg(bucket->clseg); + } + return ret; +} + /* Move reqs from written to committing lists, returning count of number moved. - * Note called with i_lock held. + * Note called with cinfo->lock held. */ -static int filelayout_scan_commit_lists(struct inode *inode, int max, - spinlock_t *lock) +static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo, + int max) { - struct pnfs_layout_segment *lseg; - struct nfs4_filelayout_segment *fl; int i, rv = 0, cnt; - lseg = find_only_write_lseg_locked(inode); - if (!lseg) - goto out_done; - fl = FILELAYOUT_LSEG(lseg); - if (fl->commit_through_mds) - goto out_done; - for (i = 0; i < fl->number_of_buckets && max != 0; i++) { - cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i], - max, lock); + for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { + cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i], + cinfo, max); max -= cnt; rv += cnt; } -out_done: return rv; } +/* Pull everything off the committing lists and dump into @dst */ +static void filelayout_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *b; + int i; + + /* NOTE cinfo->lock is NOT held, relying on fact that this is + * only called on single thread per dreq. 
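+ * (Editor's note: the dreq here is the O_DIRECT nfs_direct_req, whose
+ * completion path appears to be the only caller, so the buckets cannot
+ * change underneath us.)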
+ * Can't take the lock because need to do put_lseg + */ + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + if (transfer_commit_list(&b->written, dst, cinfo, 0)) { + BUG_ON(!list_empty(&b->written)); + put_lseg(b->wlseg); + b->wlseg = NULL; + } + } + cinfo->ds->nwritten = 0; +} + static unsigned int -alloc_ds_commits(struct inode *inode, struct list_head *list) +alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) { - struct pnfs_layout_segment *lseg; - struct nfs4_filelayout_segment *fl; - struct nfs_write_data *data; + struct pnfs_ds_commit_info *fl_cinfo; + struct pnfs_commit_bucket *bucket; + struct nfs_commit_data *data; int i, j; unsigned int nreq = 0; - /* Won't need this when non-whole file layout segments are supported - * instead we will use a pnfs_layout_hdr structure */ - lseg = find_only_write_lseg(inode); - if (!lseg) - return 0; - fl = FILELAYOUT_LSEG(lseg); - for (i = 0; i < fl->number_of_buckets; i++) { - if (list_empty(&fl->commit_buckets[i].committing)) + fl_cinfo = cinfo->ds; + bucket = fl_cinfo->buckets; + for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + if (list_empty(&bucket->committing)) continue; data = nfs_commitdata_alloc(); if (!data) break; data->ds_commit_index = i; - data->lseg = lseg; + data->lseg = bucket->clseg; + bucket->clseg = NULL; list_add(&data->pages, list); nreq++; } /* Clean up on error */ - for (j = i; j < fl->number_of_buckets; j++) { - if (list_empty(&fl->commit_buckets[i].committing)) + for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { + if (list_empty(&bucket->committing)) continue; - nfs_retry_commit(&fl->commit_buckets[i].committing, lseg); - put_lseg(lseg); /* associated with emptying bucket */ + nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); + put_lseg(bucket->clseg); + bucket->clseg = NULL; } - put_lseg(lseg); /* Caller will clean up entries put on list */ return nreq; } @@ -1026,9 +1210,9 @@ alloc_ds_commits(struct inode *inode, struct list_head *list) /* This follows nfs_commit_list pretty closely */ static int filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, - int how) + int how, struct nfs_commit_info *cinfo) { - struct nfs_write_data *data, *tmp; + struct nfs_commit_data *data, *tmp; LIST_HEAD(list); unsigned int nreq = 0; @@ -1039,30 +1223,34 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_add(&data->pages, &list); nreq++; } else - nfs_retry_commit(mds_pages, NULL); + nfs_retry_commit(mds_pages, NULL, cinfo); } - nreq += alloc_ds_commits(inode, &list); + nreq += alloc_ds_commits(cinfo, &list); if (nreq == 0) { - nfs_commit_clear_lock(NFS_I(inode)); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); goto out; } - atomic_add(nreq, &NFS_I(inode)->commits_outstanding); + atomic_add(nreq, &cinfo->mds->rpcs_out); list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); if (!data->lseg) { - nfs_init_commit(data, mds_pages, NULL); - nfs_initiate_commit(data, NFS_CLIENT(inode), - data->mds_ops, how); + nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_initiate_commit(NFS_CLIENT(inode), data, + data->mds_ops, how, 0); } else { - nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg); + struct pnfs_commit_bucket *buckets; + + buckets = cinfo->ds->buckets; + nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo); filelayout_initiate_commit(data, how); } } out: + cinfo->ds->ncommitting = 0; return 
PNFS_ATTEMPTED; } @@ -1072,17 +1260,47 @@ filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } +static struct pnfs_layout_hdr * +filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_filelayout *flo; + + flo = kzalloc(sizeof(*flo), gfp_flags); + return &flo->generic_hdr; +} + +static void +filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + kfree(FILELAYOUT_FROM_HDR(lo)); +} + +static struct pnfs_ds_commit_info * +filelayout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + else + return &FILELAYOUT_FROM_HDR(layout)->commit_info; +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .alloc_layout_hdr = filelayout_alloc_layout_hdr, + .free_layout_hdr = filelayout_free_layout_hdr, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, + .get_ds_info = &filelayout_get_ds_info, .mark_request_commit = filelayout_mark_request_commit, .clear_request_commit = filelayout_clear_request_commit, .scan_commit_lists = filelayout_scan_commit_lists, + .recover_commit_reqs = filelayout_recover_commit_reqs, .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 21190bb..43fe802 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -33,6 +33,13 @@ #include "pnfs.h" /* + * Default data server connection timeout and retrans values. + * Set by module parameters dataserver_timeo and dataserver_retrans. + */ +#define NFS4_DEF_DS_TIMEO 60 +#define NFS4_DEF_DS_RETRANS 5 + +/* * Field testing shows we need to support up to 4096 stripe indices. * We store each index as a u8 (u32 on the wire) to keep the memory footprint * reasonable. 
This in turn means we support a maximum of 256 @@ -41,6 +48,9 @@ #define NFS4_PNFS_MAX_STRIPE_CNT 4096 #define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ +/* error codes for internal use */ +#define NFS4ERR_RESET_TO_MDS 12001 + enum stripetype4 { STRIPE_SPARSE = 1, STRIPE_DENSE = 2 @@ -62,23 +72,14 @@ struct nfs4_pnfs_ds { atomic_t ds_count; }; -/* nfs4_file_layout_dsaddr flags */ -#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 - struct nfs4_file_layout_dsaddr { struct nfs4_deviceid_node id_node; - unsigned long flags; u32 stripe_count; u8 *stripe_indices; u32 ds_num; struct nfs4_pnfs_ds *ds_list[1]; }; -struct nfs4_fl_commit_bucket { - struct list_head written; - struct list_head committing; -}; - struct nfs4_filelayout_segment { struct pnfs_layout_segment generic_hdr; u32 stripe_type; @@ -89,10 +90,19 @@ struct nfs4_filelayout_segment { struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ unsigned int num_fh; struct nfs_fh **fh_array; - struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */ - int number_of_buckets; }; +struct nfs4_filelayout { + struct pnfs_layout_hdr generic_hdr; + struct pnfs_ds_commit_info commit_info; +}; + +static inline struct nfs4_filelayout * +FILELAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_filelayout, generic_hdr); +} + static inline struct nfs4_filelayout_segment * FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) { @@ -107,6 +117,36 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; } +static inline void +filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) +{ + u32 *p = (u32 *)&node->deviceid; + + printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n", + p[0], p[1], p[2], p[3]); + + set_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +static inline bool +filelayout_test_layout_invalid(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_INVALID, &lo->plh_flags); +} + +static inline bool +filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) +{ + return test_bit(NFS_DEVICEID_INVALID, &node->flags); +} + +static inline bool +filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) +{ + return filelayout_test_devid_invalid(FILELAYOUT_DEVID_NODE(lseg)) || + filelayout_test_layout_invalid(lseg->pls_layout); +} + extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); @@ -119,5 +159,6 @@ extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); +void nfs4_ds_disconnect(struct nfs_client *clp); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index c9cff9a..a1fab8d 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -30,12 +30,16 @@ #include <linux/nfs_fs.h> #include <linux/vmalloc.h> +#include <linux/module.h> #include "internal.h" #include "nfs4filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + /* * Data server cache * @@ -145,6 +149,28 @@ _data_server_lookup_locked(const struct list_head *dsaddrs) } /* + * Lookup DS by nfs_client pointer. 
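+ * (Editor's note: called from filelayout_async_handle_error() in
+ * nfs4filelayout.c when an RPC connection error invalidates the deviceid
+ * and i/o is being redirected to the MDS.)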
Zero data server client pointer + */ +void nfs4_ds_disconnect(struct nfs_client *clp) +{ + struct nfs4_pnfs_ds *ds; + struct nfs_client *found = NULL; + + dprintk("%s clp %p\n", __func__, clp); + spin_lock(&nfs4_ds_cache_lock); + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + if (ds->ds_clp && ds->ds_clp == clp) { + found = ds->ds_clp; + ds->ds_clp = NULL; + } + spin_unlock(&nfs4_ds_cache_lock); + if (found) { + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); + nfs_put_client(clp); + } +} + +/* * Create an rpc connection to the nfs4_pnfs_ds data server * Currently only supports IPv4 and IPv6 addresses */ @@ -165,8 +191,9 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) __func__, ds->ds_remotestr, da->da_remotestr); clp = nfs4_set_ds_client(mds_srv->nfs_client, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP); + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + dataserver_timeo, dataserver_retrans); if (!IS_ERR(clp)) break; } @@ -176,28 +203,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out; } - if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) { - if (!is_ds_client(clp)) { - status = -ENODEV; - goto out_put; - } - ds->ds_clp = clp; - dprintk("%s [existing] server=%s\n", __func__, - ds->ds_remotestr); - goto out; - } - - /* - * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to - * be equal to the MDS lease. Renewal is scheduled in create_session. - */ - spin_lock(&mds_srv->nfs_client->cl_lock); - clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; - spin_unlock(&mds_srv->nfs_client->cl_lock); - clp->cl_last_renewal = jiffies; - - /* New nfs_client */ - status = nfs4_init_ds_session(clp); + status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); if (status) goto out_put; @@ -602,7 +608,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net, + da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); @@ -791,48 +797,42 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) return flseg->fh_array[i]; } -static void -filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, - int err, const char *ds_remotestr) -{ - u32 *p = (u32 *)&dsaddr->id_node.deviceid; - - printk(KERN_ERR "NFS: data server %s connection error %d." 
- " Deviceid [%x%x%x%x] marked out of use.\n", - ds_remotestr, err, p[0], p[1], p[2], p[3]); - - spin_lock(&nfs4_ds_cache_lock); - dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; - spin_unlock(&nfs4_ds_cache_lock); -} - struct nfs4_pnfs_ds * nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); + + if (filelayout_test_devid_invalid(devid)) + return NULL; if (ds == NULL) { printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - return NULL; + goto mark_dev_invalid; } if (!ds->ds_clp) { struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); int err; - if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) { - /* Already tried to connect, don't try again */ - dprintk("%s Deviceid marked out of use\n", __func__); - return NULL; - } err = nfs4_ds_connect(s, ds); - if (err) { - filelayout_mark_devid_negative(dsaddr, err, - ds->ds_remotestr); - return NULL; - } + if (err) + goto mark_dev_invalid; } return ds; + +mark_dev_invalid: + filelayout_mark_devid_invalid(devid); + return NULL; } + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " + "retries a request before it attempts further " + " recovery action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " + "NFSv4.1 client waits for a response from a " + " data server before it retries an NFS request."); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index a7f3ded..017b4b0 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -132,6 +132,35 @@ static size_t nfs_parse_server_name(char *string, size_t len, return ret; } +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) { struct page *page; @@ -168,7 +197,7 @@ struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *ino rpc_authflavor_t flavor; flavor = nfs4_negotiate_security(inode, name); - if (flavor < 0) + if ((int)flavor < 0) return ERR_PTR(flavor); clone = rpc_clone_client(clnt); @@ -300,7 +329,7 @@ out: * @dentry - dentry of referral * */ -struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry) +static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry) { struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct dentry *parent; @@ -341,3 +370,25 @@ out: dprintk("%s: done\n", __func__); return mnt; } + +struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + struct dentry *parent = 
dget_parent(dentry); + struct rpc_clnt *client; + struct vfsmount *mnt; + + /* Look it up again to get its attributes and sec flavor */ + client = nfs4_proc_lookup_mountpoint(parent->d_inode, &dentry->d_name, fh, fattr); + dput(parent); + if (IS_ERR(client)) + return ERR_CAST(client); + + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(client, dentry); + else + mnt = nfs_do_submount(dentry, fh, fattr, client->cl_auth->au_flavor); + + rpc_shutdown_client(client); + return mnt; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 99650aa..d48dbef 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -64,6 +64,7 @@ #include "iostat.h" #include "callback.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -80,6 +81,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); +static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -101,6 +103,8 @@ static int nfs4_map_errors(int err) case -NFS4ERR_BADOWNER: case -NFS4ERR_BADNAME: return -EINVAL; + case -NFS4ERR_SHARE_DENIED: + return -EACCES; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -304,7 +308,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - nfs4_schedule_session_recovery(clp->cl_session); + nfs4_schedule_session_recovery(clp->cl_session, errorcode); exception->retry = 1; break; #endif /* defined(CONFIG_NFS_V4_1) */ @@ -772,7 +776,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); dir->i_version = cinfo->after; @@ -788,7 +792,6 @@ struct nfs4_opendata { struct nfs4_string owner_name; struct nfs4_string group_name; struct nfs_fattr f_attr; - struct nfs_fattr dir_attr; struct dentry *dir; struct dentry *dentry; struct nfs4_state_owner *owner; @@ -804,12 +807,10 @@ struct nfs4_opendata { static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; - p->o_res.dir_attr = &p->dir_attr; p->o_res.seqid = p->o_arg.seqid; p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); - nfs_fattr_init(&p->dir_attr); nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name); } @@ -843,7 +844,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; - p->o_arg.dir_bitmask = server->cache_consistency_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (attrs != NULL && attrs->ia_valid != 0) { __be32 verf[2]; @@ -1332,7 +1332,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, 
struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_session_recovery(server->nfs_client->cl_session); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: @@ -1611,8 +1611,6 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr); - nfs_refresh_inode(dir, o_res->dir_attr); - if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(data); if (status != 0) @@ -1645,11 +1643,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); - if (o_arg->open_flags & O_CREAT) { + if (o_arg->open_flags & O_CREAT) update_changeattr(dir, &o_res->cinfo); - nfs_post_op_update_inode(dir, o_res->dir_attr); - } else - nfs_refresh_inode(dir, o_res->dir_attr); if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { @@ -1789,7 +1784,14 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct /* * Returns a referenced nfs4_state */ -static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, + struct dentry *dentry, + fmode_t fmode, + int flags, + struct iattr *sattr, + struct rpc_cred *cred, + struct nfs4_state **res, + struct nfs4_threshold **ctx_th) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; @@ -1814,6 +1816,11 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode if (opendata == NULL) goto err_put_state_owner; + if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { + opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); + if (!opendata->f_attr.mdsthreshold) + goto err_opendata_put; + } if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); @@ -1839,11 +1846,19 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode nfs_setattr_update_inode(state->inode, sattr); nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); } + + if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) + *ctx_th = opendata->f_attr.mdsthreshold; + else + kfree(opendata->f_attr.mdsthreshold); + opendata->f_attr.mdsthreshold = NULL; + nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); *res = state; return 0; err_opendata_put: + kfree(opendata->f_attr.mdsthreshold); nfs4_opendata_put(opendata); err_put_state_owner: nfs4_put_state_owner(sp); @@ -1853,14 +1868,21 @@ out_err: } -static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) +static struct nfs4_state *nfs4_do_open(struct inode *dir, + struct dentry *dentry, + fmode_t fmode, + int flags, + struct iattr *sattr, + struct rpc_cred *cred, + struct nfs4_threshold **ctx_th) { struct nfs4_exception exception = { }; struct nfs4_state *res; int status; do { - status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res); + status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, + &res, ctx_th); if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -2184,7 +2206,8 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context 
*ctx, int open_flags struct nfs4_state *state; /* Protect against concurrent sillydeletes */ - state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred); + state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, + ctx->cred, &ctx->mdsthreshold); if (IS_ERR(state)) return ERR_CAST(state); ctx->state = state; @@ -2354,8 +2377,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, /* * get the file handle for the "/" directory on the server */ -static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) +int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) { int minor_version = server->nfs_client->cl_minorversion; int status = nfs4_lookup_root(server, fhandle, info); @@ -2372,6 +2395,31 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_map_errors(status); } +static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, + struct nfs_fsinfo *info) +{ + int error; + struct nfs_fattr *fattr = info->fattr; + + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs4_get_root: getcaps error = %d\n", -error); + return error; + } + + error = nfs4_proc_getattr(server, mntfh, fattr); + if (error < 0) { + dprintk("nfs4_get_root: getattr error = %d\n", -error); + return error; + } + + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + + return error; +} + /* * Get locations and (maybe) other attributes of a referral. * Note that we'll actually follow the referral later when @@ -2578,7 +2626,7 @@ out: return err; } -static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, +static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; @@ -2761,7 +2809,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, fmode = ctx->mode; } sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, de, fmode, flags, sattr, cred); + state = nfs4_do_open(dir, de, fmode, flags, sattr, cred, NULL); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -2782,9 +2830,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs args = { .fh = NFS_FH(dir), - .name.len = name->len, - .name.name = name->name, - .bitmask = server->attr_bitmask, + .name = *name, }; struct nfs_removeres res = { .server = server, @@ -2794,19 +2840,11 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; - int status = -ENOMEM; - - res.dir_attr = nfs_alloc_fattr(); - if (res.dir_attr == NULL) - goto out; + int status; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); - if (status == 0) { + if (status == 0) update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(dir, res.dir_attr); - } - nfs_free_fattr(res.dir_attr); -out: return status; } @@ -2828,7 +2866,6 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - args->bitmask = server->cache_consistency_bitmask; res->server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; 
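The REMOVE/UNLINK hunks above drop the post-op GETATTR on the parent directory (and the bitmask that requested it) and rely solely on the change_info4 already carried in the operation's reply. Below is a minimal user-space sketch of that style of directory-cache revalidation; the struct names and the post_op_update() helper are hypothetical stand-ins, not the kernel's update_changeattr() internals.

/* Sketch of change_info4-based directory cache revalidation.
 * All types here are illustrative, not kernel definitions. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct change_info { bool atomic; uint64_t before, after; };
struct dir_cache  { uint64_t change_attr; bool valid; };

/* If the server reports an atomic change whose "before" matches our
 * cached change attribute, adopt "after" without refetching any
 * attributes; otherwise the cached directory data must be dropped. */
static void post_op_update(struct dir_cache *dc, const struct change_info *ci)
{
	if (ci->atomic && dc->valid && dc->change_attr == ci->before)
		dc->change_attr = ci->after;	/* cache stays warm */
	else
		dc->valid = false;		/* force revalidation */
}

int main(void)
{
	struct dir_cache dc = { .change_attr = 41, .valid = true };
	struct change_info ci = { .atomic = true, .before = 41, .after = 42 };

	post_op_update(&dc, &ci);
	printf("valid=%d change=%llu\n", dc.valid,
	       (unsigned long long)dc.change_attr);
	return 0;
}

When the reported change is atomic and contiguous with the cached value, the client keeps its cached directory data; any mismatch forces revalidation. That is the trade these hunks make in exchange for shorter compounds and one less fattr allocation per operation.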
nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); @@ -2853,7 +2890,6 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); - nfs_post_op_update_inode(dir, res->dir_attr); return 1; } @@ -2864,7 +2900,6 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) struct nfs_renameres *res = msg->rpc_resp; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; - arg->bitmask = server->attr_bitmask; res->server = server; nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1); } @@ -2890,9 +2925,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 0; update_changeattr(old_dir, &res->old_cinfo); - nfs_post_op_update_inode(old_dir, res->old_fattr); update_changeattr(new_dir, &res->new_cinfo); - nfs_post_op_update_inode(new_dir, res->new_fattr); return 1; } @@ -2905,7 +2938,6 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, .new_dir = NFS_FH(new_dir), .old_name = old_name, .new_name = new_name, - .bitmask = server->attr_bitmask, }; struct nfs_renameres res = { .server = server, @@ -2917,21 +2949,11 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, }; int status = -ENOMEM; - res.old_fattr = nfs_alloc_fattr(); - res.new_fattr = nfs_alloc_fattr(); - if (res.old_fattr == NULL || res.new_fattr == NULL) - goto out; - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); - nfs_post_op_update_inode(old_dir, res.old_fattr); update_changeattr(new_dir, &res.new_cinfo); - nfs_post_op_update_inode(new_dir, res.new_fattr); } -out: - nfs_free_fattr(res.new_fattr); - nfs_free_fattr(res.old_fattr); return status; } @@ -2969,18 +2991,15 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * int status = -ENOMEM; res.fattr = nfs_alloc_fattr(); - res.dir_attr = nfs_alloc_fattr(); - if (res.fattr == NULL || res.dir_attr == NULL) + if (res.fattr == NULL) goto out; status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); - nfs_post_op_update_inode(dir, res.dir_attr); nfs_post_op_update_inode(inode, res.fattr); } out: - nfs_free_fattr(res.dir_attr); nfs_free_fattr(res.fattr); return status; } @@ -3003,7 +3022,6 @@ struct nfs4_createdata { struct nfs4_create_res res; struct nfs_fh fh; struct nfs_fattr fattr; - struct nfs_fattr dir_fattr; }; static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, @@ -3027,9 +3045,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; - data->res.dir_fattr = &data->dir_fattr; nfs_fattr_init(data->res.fattr); - nfs_fattr_init(data->res.dir_fattr); } return data; } @@ -3040,7 +3056,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); - nfs_post_op_update_inode(dir, data->res.dir_fattr); status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); } return status; @@ -3336,12 +3351,12 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, void __nfs4_read_done_cb(struct nfs_read_data *data) { - nfs_invalidate_atime(data->inode); + 
nfs_invalidate_atime(data->header->inode); } static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { - struct nfs_server *server = NFS_SERVER(data->inode); + struct nfs_server *server = NFS_SERVER(data->header->inode); if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { rpc_restart_call_prepare(task); @@ -3376,7 +3391,7 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - if (nfs4_setup_sequence(NFS_SERVER(data->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, task)) @@ -3384,25 +3399,9 @@ static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_da rpc_call_start(task); } -/* Reset the the nfs_read_data to send the read to the MDS. */ -void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) -{ - dprintk("%s Reset task for i/o through\n", __func__); - put_lseg(data->lseg); - data->lseg = NULL; - /* offsets will differ in the dense stripe case */ - data->args.offset = data->mds_offset; - data->ds_clp = NULL; - data->args.fh = NFS_FH(data->inode); - data->read_done_cb = nfs4_read_done_cb; - task->tk_ops = data->mds_ops; - rpc_task_reset_client(task, NFS_CLIENT(data->inode)); -} -EXPORT_SYMBOL_GPL(nfs4_reset_read); - static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { - struct inode *inode = data->inode; + struct inode *inode = data->header->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { rpc_restart_call_prepare(task); @@ -3410,7 +3409,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), data->timestamp); - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); } return 0; } @@ -3423,32 +3422,30 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) nfs4_write_done_cb(task, data); } -/* Reset the the nfs_write_data to send the write to the MDS. 
*/ -void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data) +static +bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) { - dprintk("%s Reset task for i/o through\n", __func__); - put_lseg(data->lseg); - data->lseg = NULL; - data->ds_clp = NULL; - data->write_done_cb = nfs4_write_done_cb; - data->args.fh = NFS_FH(data->inode); - data->args.bitmask = data->res.server->cache_consistency_bitmask; - data->args.offset = data->mds_offset; - data->res.fattr = &data->fattr; - task->tk_ops = data->mds_ops; - rpc_task_reset_client(task, NFS_CLIENT(data->inode)); + const struct nfs_pgio_header *hdr = data->header; + + /* Don't request attributes for pNFS or O_DIRECT writes */ + if (data->ds_clp != NULL || hdr->dreq != NULL) + return false; + /* Otherwise, request attributes if and only if we don't hold + * a delegation + */ + return nfs_have_delegation(hdr->inode, FMODE_READ) == 0; } -EXPORT_SYMBOL_GPL(nfs4_reset_write); static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) { - struct nfs_server *server = NFS_SERVER(data->inode); + struct nfs_server *server = NFS_SERVER(data->header->inode); - if (data->lseg) { + if (!nfs4_write_need_cache_consistency_data(data)) { data->args.bitmask = NULL; data->res.fattr = NULL; } else data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) data->write_done_cb = nfs4_write_done_cb; data->res.server = server; @@ -3460,6 +3457,16 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), + &data->args.seq_args, + &data->res.seq_res, + task)) + return; + rpc_call_start(task); +} + +static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ if (nfs4_setup_sequence(NFS_SERVER(data->inode), &data->args.seq_args, &data->res.seq_res, @@ -3468,7 +3475,7 @@ static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_ rpc_call_start(task); } -static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { struct inode *inode = data->inode; @@ -3476,28 +3483,22 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *dat rpc_restart_call_prepare(task); return -EAGAIN; } - nfs_refresh_inode(inode, data->res.fattr); return 0; } -static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; - return data->write_done_cb(task, data); + return data->commit_done_cb(task, data); } -static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); - if (data->lseg) { - data->args.bitmask = NULL; - data->res.fattr = NULL; - } else - data->args.bitmask = server->cache_consistency_bitmask; - if (!data->write_done_cb) - data->write_done_cb = nfs4_commit_done_cb; + if (data->commit_done_cb == NULL) + data->commit_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); @@ 
-3906,7 +3907,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - nfs4_schedule_session_recovery(clp->cl_session); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3932,13 +3933,21 @@ wait_on_recovery: return -EAGAIN; } -static void nfs4_construct_boot_verifier(struct nfs_client *clp, - nfs4_verifier *bootverf) +static void nfs4_init_boot_verifier(const struct nfs_client *clp, + nfs4_verifier *bootverf) { __be32 verf[2]; - verf[0] = htonl((u32)clp->cl_boot_time.tv_sec); - verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec); + if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { + /* An impossible timestamp guarantees this value + * will never match a generated boot time. */ + verf[0] = 0; + verf[1] = (__be32)(NSEC_PER_SEC + 1); + } else { + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + verf[0] = (__be32)nn->boot_time.tv_sec; + verf[1] = (__be32)nn->boot_time.tv_nsec; + } memcpy(bootverf->data, verf, sizeof(bootverf->data)); } @@ -3961,7 +3970,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, int loop = 0; int status; - nfs4_construct_boot_verifier(clp, &sc_verifier); + nfs4_init_boot_verifier(clp, &sc_verifier); for(;;) { rcu_read_lock(); @@ -4105,7 +4114,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; - data->args.bitmask = server->attr_bitmask; + data->args.bitmask = server->cache_consistency_bitmask; nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; @@ -4126,9 +4135,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (status != 0) goto out; status = data->rpc_status; - if (status != 0) - goto out; - nfs_refresh_inode(inode, &data->fattr); + if (status == 0) + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + else + nfs_refresh_inode(inode, &data->fattr); out: rpc_put_task(task); return status; @@ -4838,7 +4848,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_session_recovery(server->nfs_client->cl_session); + nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); goto out; case -ERESTARTSYS: /* @@ -5080,7 +5090,8 @@ out_inval: } static bool -nfs41_same_server_scope(struct server_scope *a, struct server_scope *b) +nfs41_same_server_scope(struct nfs41_server_scope *a, + struct nfs41_server_scope *b) { if (a->server_scope_sz == b->server_scope_sz && memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) @@ -5090,6 +5101,61 @@ nfs41_same_server_scope(struct server_scope *a, struct server_scope *b) } /* + * nfs4_proc_bind_conn_to_session() + * + * The 4.1 client currently uses the same TCP connection for the + * fore and backchannel. 
+ */ +int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) +{ + int status; + struct nfs41_bind_conn_to_session_res res; + struct rpc_message msg = { + .rpc_proc = + &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], + .rpc_argp = clp, + .rpc_resp = &res, + .rpc_cred = cred, + }; + + dprintk("--> %s\n", __func__); + BUG_ON(clp == NULL); + + res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); + if (unlikely(res.session == NULL)) { + status = -ENOMEM; + goto out; + } + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status == 0) { + if (memcmp(res.session->sess_id.data, + clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { + dprintk("NFS: %s: Session ID mismatch\n", __func__); + status = -EIO; + goto out_session; + } + if (res.dir != NFS4_CDFS4_BOTH) { + dprintk("NFS: %s: Unexpected direction from server\n", + __func__); + status = -EIO; + goto out_session; + } + if (res.use_conn_in_rdma_mode) { + dprintk("NFS: %s: Server returned RDMA mode = true\n", + __func__); + status = -EIO; + goto out_session; + } + } +out_session: + kfree(res.session); +out: + dprintk("<-- %s status= %d\n", __func__, status); + return status; +} + +/* * nfs4_proc_exchange_id() * * Since the clientid has expired, all compounds using sessions @@ -5106,7 +5172,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, }; struct nfs41_exchange_id_res res = { - .client = clp, + 0 }; int status; struct rpc_message msg = { @@ -5119,7 +5185,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) dprintk("--> %s\n", __func__); BUG_ON(clp == NULL); - nfs4_construct_boot_verifier(clp, &verifier); + nfs4_init_boot_verifier(clp, &verifier); args.id_len = scnprintf(args.id, sizeof(args.id), "%s/%s/%u", @@ -5127,59 +5193,135 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->cl_rpcclient->cl_nodename, clp->cl_rpcclient->cl_auth->au_flavor); - res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); - if (unlikely(!res.server_scope)) { + res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + if (unlikely(res.server_owner == NULL)) { status = -ENOMEM; goto out; } - res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL); - if (unlikely(!res.impl_id)) { + res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + GFP_NOFS); + if (unlikely(res.server_scope == NULL)) { + status = -ENOMEM; + goto out_server_owner; + } + + res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(res.impl_id == NULL)) { status = -ENOMEM; goto out_server_scope; } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - if (!status) - status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + if (status == 0) + status = nfs4_check_cl_exchange_flags(res.flags); + + if (status == 0) { + clp->cl_clientid = res.clientid; + clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); + if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) + clp->cl_seqid = res.seqid; + + kfree(clp->cl_serverowner); + clp->cl_serverowner = res.server_owner; + res.server_owner = NULL; - if (!status) { /* use the most recent implementation id */ - kfree(clp->impl_id); - clp->impl_id = res.impl_id; - } else - kfree(res.impl_id); + kfree(clp->cl_implid); + clp->cl_implid = res.impl_id; - if (!status) { - if (clp->server_scope && - !nfs41_same_server_scope(clp->server_scope, + if (clp->cl_serverscope != NULL && + 
!nfs41_same_server_scope(clp->cl_serverscope, res.server_scope)) { dprintk("%s: server_scope mismatch detected\n", __func__); set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->server_scope); - clp->server_scope = NULL; + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; } - if (!clp->server_scope) { - clp->server_scope = res.server_scope; + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = res.server_scope; goto out; } - } + } else + kfree(res.impl_id); +out_server_owner: + kfree(res.server_owner); out_server_scope: kfree(res.server_scope); out: - if (clp->impl_id) + if (clp->cl_implid != NULL) dprintk("%s: Server Implementation ID: " "domain: %s, name: %s, date: %llu,%u\n", - __func__, clp->impl_id->domain, clp->impl_id->name, - clp->impl_id->date.seconds, - clp->impl_id->date.nseconds); + __func__, clp->cl_implid->domain, clp->cl_implid->name, + clp->cl_implid->date.seconds, + clp->cl_implid->date.nseconds); dprintk("<-- %s status= %d\n", __func__, status); return status; } +static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, + struct rpc_cred *cred) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID], + .rpc_argp = clp, + .rpc_cred = cred, + }; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + if (status) + pr_warn("NFS: Got error %d from the server %s on " + "DESTROY_CLIENTID.", status, clp->cl_hostname); + return status; +} + +static int nfs4_proc_destroy_clientid(struct nfs_client *clp, + struct rpc_cred *cred) +{ + unsigned int loop; + int ret; + + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { + ret = _nfs4_proc_destroy_clientid(clp, cred); + switch (ret) { + case -NFS4ERR_DELAY: + case -NFS4ERR_CLIENTID_BUSY: + ssleep(1); + break; + default: + return ret; + } + } + return 0; +} + +int nfs4_destroy_clientid(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int ret = 0; + + if (clp->cl_mvops->minor_version < 1) + goto out; + if (clp->cl_exchange_flags == 0) + goto out; + cred = nfs4_get_exchange_id_cred(clp); + ret = nfs4_proc_destroy_clientid(clp, cred); + if (cred) + put_rpccred(cred); + switch (ret) { + case 0: + case -NFS4ERR_STALE_CLIENTID: + clp->cl_exchange_flags = 0; + } +out: + return ret; +} + struct nfs4_get_lease_time_data { struct nfs4_get_lease_time_args *args; struct nfs4_get_lease_time_res *res; @@ -5400,8 +5542,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) void nfs4_destroy_session(struct nfs4_session *session) { struct rpc_xprt *xprt; + struct rpc_cred *cred; - nfs4_proc_destroy_session(session); + cred = nfs4_get_exchange_id_cred(session->clp); + nfs4_proc_destroy_session(session, cred); + if (cred) + put_rpccred(cred); rcu_read_lock(); xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); @@ -5511,7 +5657,8 @@ static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, return nfs4_verify_back_channel_attrs(args, session); } -static int _nfs4_proc_create_session(struct nfs_client *clp) +static int _nfs4_proc_create_session(struct nfs_client *clp, + struct rpc_cred *cred) { struct nfs4_session *session = clp->cl_session; struct nfs41_create_session_args args = { @@ -5525,6 +5672,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], .rpc_argp = &args, .rpc_resp = &res, + .rpc_cred = cred, }; int status; @@ -5549,7 +5697,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp) * It is the 
responsibility of the caller to verify the session is * expired before calling this routine. */ -int nfs4_proc_create_session(struct nfs_client *clp) +int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred) { int status; unsigned *ptr; @@ -5557,7 +5705,7 @@ int nfs4_proc_create_session(struct nfs_client *clp) dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); - status = _nfs4_proc_create_session(clp); + status = _nfs4_proc_create_session(clp, cred); if (status) goto out; @@ -5579,10 +5727,15 @@ out: * Issue the over-the-wire RPC DESTROY_SESSION. * The caller must serialize access to this routine. */ -int nfs4_proc_destroy_session(struct nfs4_session *session) +int nfs4_proc_destroy_session(struct nfs4_session *session, + struct rpc_cred *cred) { + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION], + .rpc_argp = session, + .rpc_cred = cred, + }; int status = 0; - struct rpc_message msg; dprintk("--> nfs4_proc_destroy_session\n"); @@ -5590,10 +5743,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) if (session->clp->cl_cons_state != NFS_CS_READY) return status; - msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION]; - msg.rpc_argp = session; - msg.rpc_resp = NULL; - msg.rpc_cred = NULL; status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status) @@ -5605,53 +5754,79 @@ int nfs4_proc_destroy_session(struct nfs4_session *session) return status; } +/* + * With sessions, the client is not marked ready until after a + * successful EXCHANGE_ID and CREATE_SESSION. + * + * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate + * other versions of NFS can be tried. + */ +static int nfs41_check_session_ready(struct nfs_client *clp) +{ + int ret; + + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + ret = nfs4_client_recover_expired_lease(clp); + if (ret) + return ret; + } + if (clp->cl_cons_state < NFS_CS_READY) + return -EPROTONOSUPPORT; + smp_rmb(); + return 0; +} + int nfs4_init_session(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs4_session *session; unsigned int rsize, wsize; - int ret; if (!nfs4_has_session(clp)) return 0; session = clp->cl_session; - if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) - return 0; + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { - rsize = server->rsize; - if (rsize == 0) - rsize = NFS_MAX_FILE_IO_SIZE; - wsize = server->wsize; - if (wsize == 0) - wsize = NFS_MAX_FILE_IO_SIZE; + rsize = server->rsize; + if (rsize == 0) + rsize = NFS_MAX_FILE_IO_SIZE; + wsize = server->wsize; + if (wsize == 0) + wsize = NFS_MAX_FILE_IO_SIZE; - session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; - session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; + session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; + } + spin_unlock(&clp->cl_lock); - ret = nfs4_recover_expired_lease(server); - if (!ret) - ret = nfs4_check_client_ready(clp); - return ret; + return nfs41_check_session_ready(clp); } -int nfs4_init_ds_session(struct nfs_client *clp) +int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time) { struct nfs4_session *session = clp->cl_session; int ret; - if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) - return 0; - - ret = nfs4_client_recover_expired_lease(clp); - if (!ret) - /* Test for the DS role 
*/ - if (!is_ds_client(clp)) - ret = -ENODEV; - if (!ret) - ret = nfs4_check_client_ready(clp); - return ret; + spin_lock(&clp->cl_lock); + if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) { + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the + * DS lease to be equal to the MDS lease. + */ + clp->cl_lease_time = lease_time; + clp->cl_last_renewal = jiffies; + } + spin_unlock(&clp->cl_lock); + ret = nfs41_check_session_ready(clp); + if (ret) + return ret; + /* Test for the DS role */ + if (!is_ds_client(clp)) + return -ENODEV; + return 0; } EXPORT_SYMBOL_GPL(nfs4_init_ds_session); @@ -6558,6 +6733,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .file_inode_ops = &nfs4_file_inode_operations, .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, + .submount = nfs4_submount, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, .lookup = nfs4_proc_lookup, @@ -6590,13 +6766,13 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, + .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, .commit_done = nfs4_commit_done, .lock = nfs4_proc_lock, .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .init_client = nfs4_init_client, - .secinfo = nfs4_proc_secinfo, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index dc484c0..6930bec 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -49,7 +49,7 @@ #include "nfs4_fs.h" #include "delegation.h" -#define NFSDBG_FACILITY NFSDBG_PROC +#define NFSDBG_FACILITY NFSDBG_STATE void nfs4_renew_state(struct work_struct *work) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7f0fcfc..c679b9e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -57,6 +57,8 @@ #include "internal.h" #include "pnfs.h" +#define NFSDBG_FACILITY NFSDBG_STATE + #define OPENOWNER_POOL_SIZE 8 const nfs4_stateid zero_stateid; @@ -254,7 +256,7 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) goto out; set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); do_confirm: - status = nfs4_proc_create_session(clp); + status = nfs4_proc_create_session(clp, cred); if (status != 0) goto out; clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); @@ -1106,6 +1108,8 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp) return; if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + dprintk("%s: scheduling lease recovery for server %s\n", __func__, + clp->cl_hostname); nfs4_schedule_state_manager(clp); } EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); @@ -1122,6 +1126,8 @@ static void nfs40_handle_cb_pathdown(struct nfs_client *clp) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); nfs_expire_all_delegations(clp); + dprintk("%s: handling CB_PATHDOWN recovery for server %s\n", __func__, + clp->cl_hostname); } void nfs4_schedule_path_down_recovery(struct nfs_client *clp) @@ -1158,6 +1164,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4 struct nfs_client *clp = server->nfs_client; nfs4_state_mark_reclaim_nograce(clp, state); + dprintk("%s: scheduling stateid recovery for server %s\n", __func__, + clp->cl_hostname); nfs4_schedule_state_manager(clp); } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -1491,19 +1499,25 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, 
int error) case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* Zero session reset errors */ break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; case -EKEYEXPIRED: /* Nothing we can do */ nfs4_warn_keyexpired(clp->cl_hostname); break; default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); return error; } + dprintk("%s: handled error %d for server %s\n", __func__, error, + clp->cl_hostname); return 0; } @@ -1572,34 +1586,82 @@ out: return nfs4_recovery_handle_error(clp, status); } +/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors + * on EXCHANGE_ID for v4.1 + */ +static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) +{ + switch (status) { + case -NFS4ERR_SEQ_MISORDERED: + if (test_and_set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) + return -ESERVERFAULT; + /* Lease confirmation error: retry after purging the lease */ + ssleep(1); + case -NFS4ERR_CLID_INUSE: + case -NFS4ERR_STALE_CLIENTID: + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + break; + case -EACCES: + if (clp->cl_machine_cred == NULL) + return -EACCES; + /* Handle case where the user hasn't set up machine creds */ + nfs4_clear_machine_cred(clp); + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EAGAIN: + ssleep(1); + break; + + case -NFS4ERR_MINOR_VERS_MISMATCH: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, -EPROTONOSUPPORT); + dprintk("%s: exit with error %d for server %s\n", + __func__, -EPROTONOSUPPORT, clp->cl_hostname); + return -EPROTONOSUPPORT; + case -EKEYEXPIRED: + nfs4_warn_keyexpired(clp->cl_hostname); + case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery + * in nfs4_exchange_id */ + default: + dprintk("%s: exit with error %d for server %s\n", __func__, + status, clp->cl_hostname); + return status; + } + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + dprintk("%s: handled error %d for server %s\n", __func__, status, + clp->cl_hostname); + return 0; +} + static int nfs4_reclaim_lease(struct nfs_client *clp) { struct rpc_cred *cred; const struct nfs4_state_recovery_ops *ops = clp->cl_mvops->reboot_recovery_ops; - int status = -ENOENT; + int status; cred = ops->get_clid_cred(clp); - if (cred != NULL) { - status = ops->establish_clid(clp, cred); - put_rpccred(cred); - /* Handle case where the user hasn't set up machine creds */ - if (status == -EACCES && cred == clp->cl_machine_cred) { - nfs4_clear_machine_cred(clp); - status = -EAGAIN; - } - if (status == -NFS4ERR_MINOR_VERS_MISMATCH) - status = -EPROTONOSUPPORT; - } - return status; + if (cred == NULL) + return -ENOENT; + status = ops->establish_clid(clp, cred); + put_rpccred(cred); + if (status != 0) + return nfs4_handle_reclaim_lease_error(clp, status); + return 0; } #ifdef CONFIG_NFS_V4_1 -void nfs4_schedule_session_recovery(struct nfs4_session *session) +void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { struct nfs_client *clp = session->clp; - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + switch (err) { + default: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + } nfs4_schedule_lease_recovery(clp); } 
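With the err argument added to nfs4_schedule_session_recovery() above, recovery can now distinguish a connection that merely lost its session binding from a session that is actually dead. A compact sketch of that dispatch follows, using illustrative flag names rather than the kernel's NFS4CLNT_* state bits.

/* Sketch of the error-to-recovery-flag dispatch introduced above:
 * pick the cheapest action that can repair the reported condition.
 * Error value per RFC 5661; the recovery bits are illustrative. */
#include <stdio.h>

enum { NFS4ERR_CONN_NOT_BOUND_TO_SESSION = -10060 };

enum recovery_bit {
	RECOVER_BIND_CONN     = 1 << 0,	/* re-bind the connection only */
	RECOVER_SESSION_RESET = 1 << 1,	/* destroy + recreate the session */
};

static unsigned schedule_session_recovery(unsigned pending, int err)
{
	switch (err) {
	case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
		return pending | RECOVER_BIND_CONN;
	default:
		return pending | RECOVER_SESSION_RESET;
	}
}

int main(void)
{
	unsigned pending = 0;

	pending = schedule_session_recovery(pending,
					NFS4ERR_CONN_NOT_BOUND_TO_SESSION);
	printf("bind_conn=%d reset=%d\n",
	       !!(pending & RECOVER_BIND_CONN),
	       !!(pending & RECOVER_SESSION_RESET));
	return 0;
}

Re-binding the connection leaves the session, its slot tables, and all open state intact, so handling NFS4ERR_CONN_NOT_BOUND_TO_SESSION as its own recovery action avoids a needless full session reset.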
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); @@ -1607,14 +1669,19 @@ EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); void nfs41_handle_recall_slot(struct nfs_client *clp) { set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + dprintk("%s: scheduling slot recall for server %s\n", __func__, + clp->cl_hostname); nfs4_schedule_state_manager(clp); } static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { - clp->cl_boot_time = CURRENT_TIME; + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); nfs4_state_start_reclaim_nograce(clp); + dprintk("%s: scheduling reset of all state for server %s!\n", + __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); } } @@ -1623,33 +1690,50 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { nfs4_state_start_reclaim_reboot(clp); + dprintk("%s: server %s rebooted!\n", __func__, + clp->cl_hostname); nfs4_schedule_state_manager(clp); } } static void nfs41_handle_state_revoked(struct nfs_client *clp) { - /* Temporary */ nfs4_reset_all_state(clp); + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); } static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { /* This will need to handle layouts too */ nfs_expire_all_delegations(clp); + dprintk("%s: Recallable state revoked on server %s!\n", __func__, + clp->cl_hostname); } -static void nfs41_handle_cb_path_down(struct nfs_client *clp) +static void nfs41_handle_backchannel_fault(struct nfs_client *clp) { nfs_expire_all_delegations(clp); if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, + clp->cl_hostname); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, + &clp->cl_state) == 0) + nfs4_schedule_state_manager(clp); } void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; + + dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", + __func__, clp->cl_hostname, clp->cl_clientid, flags); + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | @@ -1659,18 +1743,21 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs41_handle_state_revoked(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); - if (flags & (SEQ4_STATUS_CB_PATH_DOWN | - SEQ4_STATUS_BACKCHANNEL_FAULT | - SEQ4_STATUS_CB_PATH_DOWN_SESSION)) + if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) + nfs41_handle_backchannel_fault(clp); + else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_CB_PATH_DOWN_SESSION)) nfs41_handle_cb_path_down(clp); } static int nfs4_reset_session(struct nfs_client *clp) { + struct rpc_cred *cred; int status; nfs4_begin_drain_session(clp); - status = nfs4_proc_destroy_session(clp->cl_session); + cred = nfs4_get_exchange_id_cred(clp); + status = nfs4_proc_destroy_session(clp->cl_session, cred); if (status && status != -NFS4ERR_BADSESSION && status != -NFS4ERR_DEADSESSION) { status = nfs4_recovery_handle_error(clp, status); @@ -1678,19 +1765,26 @@ static int nfs4_reset_session(struct nfs_client *clp) } memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); - status = nfs4_proc_create_session(clp); + 
status = nfs4_proc_create_session(clp, cred); if (status) { - status = nfs4_recovery_handle_error(clp, status); + dprintk("%s: session reset failed with status %d for server %s!\n", + __func__, status, clp->cl_hostname); + status = nfs4_handle_reclaim_lease_error(clp, status); goto out; } clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + dprintk("%s: session reset was successful for server %s!\n", + __func__, clp->cl_hostname); /* Let the state manager reestablish state */ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) nfs41_setup_state_renewal(clp); out: + if (cred) + put_rpccred(cred); return status; } @@ -1722,37 +1816,41 @@ static int nfs4_recall_slot(struct nfs_client *clp) return 0; } -#else /* CONFIG_NFS_V4_1 */ -static int nfs4_reset_session(struct nfs_client *clp) { return 0; } -static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } -static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } -#endif /* CONFIG_NFS_V4_1 */ - -/* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors - * on EXCHANGE_ID for v4.1 - */ -static void nfs4_set_lease_expired(struct nfs_client *clp, int status) +static int nfs4_bind_conn_to_session(struct nfs_client *clp) { - switch (status) { - case -NFS4ERR_CLID_INUSE: - case -NFS4ERR_STALE_CLIENTID: - clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + struct rpc_cred *cred; + int ret; + + nfs4_begin_drain_session(clp); + cred = nfs4_get_exchange_id_cred(clp); + ret = nfs4_proc_bind_conn_to_session(clp, cred); + if (cred) + put_rpccred(cred); + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + switch (ret) { + case 0: + dprintk("%s: bind_conn_to_session was successful for server %s!\n", + __func__, clp->cl_hostname); break; case -NFS4ERR_DELAY: - case -ETIMEDOUT: - case -EAGAIN: ssleep(1); + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); break; - - case -EKEYEXPIRED: - nfs4_warn_keyexpired(clp->cl_hostname); - case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery - * in nfs4_exchange_id */ default: - return; + return nfs4_recovery_handle_error(clp, ret); } - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + return 0; } +#else /* CONFIG_NFS_V4_1 */ +static int nfs4_reset_session(struct nfs_client *clp) { return 0; } +static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } +static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } + +static int nfs4_bind_conn_to_session(struct nfs_client *clp) +{ + return 0; +} +#endif /* CONFIG_NFS_V4_1 */ static void nfs4_state_manager(struct nfs_client *clp) { @@ -1760,19 +1858,21 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Ensure exclusive access to NFSv4 state */ do { + if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { + status = nfs4_reclaim_lease(clp); + if (status < 0) + goto out_error; + clear_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + } + if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { /* We're going to have to re-establish a clientid */ status = nfs4_reclaim_lease(clp); - if (status) { - nfs4_set_lease_expired(clp, status); - if (test_bit(NFS4CLNT_LEASE_EXPIRED, - &clp->cl_state)) - continue; - if (clp->cl_cons_state == - NFS_CS_SESSION_INITING) - nfs_mark_client_ready(clp, status); + if (status < 0) goto out_error; - } + if (test_bit(NFS4CLNT_LEASE_EXPIRED, 
&clp->cl_state)) + continue; clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, @@ -1803,6 +1903,15 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } + /* Send BIND_CONN_TO_SESSION */ + if (test_and_clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, + &clp->cl_state) && nfs4_has_session(clp)) { + status = nfs4_bind_conn_to_session(clp); + if (status < 0) + goto out_error; + continue; + } + /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c54aae3..ee4a74d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -53,9 +53,11 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> + #include "nfs4_fs.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -99,9 +101,12 @@ static int nfs4_stat_to_errno(int); #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define nfs4_group_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +/* We support only one layout type per file system */ +#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + nfs4_owner_maxsz + nfs4_group_maxsz)) + 3 + 3 + 3 + nfs4_owner_maxsz + \ + nfs4_group_maxsz + decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ nfs4_fattr_value_maxsz) #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) @@ -321,8 +326,20 @@ static int nfs4_stat_to_errno(int); 1 /* csr_flags */ + \ decode_channel_attrs_maxsz + \ decode_channel_attrs_maxsz) +#define encode_bind_conn_to_session_maxsz (op_encode_hdr_maxsz + \ + /* bctsa_sessid */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* bctsa_dir */ + \ + 1 /* bctsa_use_conn_in_rdma_mode */) +#define decode_bind_conn_to_session_maxsz (op_decode_hdr_maxsz + \ + /* bctsr_sessid */ \ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + \ + 1 /* bctsr_dir */ + \ + 1 /* bctsr_use_conn_in_rdma_mode */) #define encode_destroy_session_maxsz (op_encode_hdr_maxsz + 4) #define decode_destroy_session_maxsz (op_decode_hdr_maxsz) +#define encode_destroy_clientid_maxsz (op_encode_hdr_maxsz + 2) +#define decode_destroy_clientid_maxsz (op_decode_hdr_maxsz) #define encode_sequence_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 4) #define decode_sequence_maxsz (op_decode_hdr_maxsz + \ @@ -421,30 +438,22 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_commit_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_commit_maxsz + \ - encode_getattr_maxsz) + encode_commit_maxsz) #define NFS4_dec_commit_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_commit_maxsz + \ - decode_getattr_maxsz) + decode_commit_maxsz) #define NFS4_enc_open_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_savefh_maxsz + \ encode_open_maxsz + \ encode_getfh_maxsz + \ - encode_getattr_maxsz + \ - encode_restorefh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_savefh_maxsz + \ decode_open_maxsz + \ decode_getfh_maxsz + \ - decode_getattr_maxsz + \ - decode_restorefh_maxsz + \ 
decode_getattr_maxsz) #define NFS4_enc_open_confirm_sz \ (compound_encode_hdr_maxsz + \ @@ -595,47 +604,37 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_remove_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_remove_maxsz + \ - encode_getattr_maxsz) + encode_remove_maxsz) #define NFS4_dec_remove_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_remove_maxsz + \ - decode_getattr_maxsz) + decode_remove_maxsz) #define NFS4_enc_rename_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_rename_maxsz + \ - encode_getattr_maxsz + \ - encode_restorefh_maxsz + \ - encode_getattr_maxsz) + encode_rename_maxsz) #define NFS4_dec_rename_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_rename_maxsz + \ - decode_getattr_maxsz + \ - decode_restorefh_maxsz + \ - decode_getattr_maxsz) + decode_rename_maxsz) #define NFS4_enc_link_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ encode_link_maxsz + \ - decode_getattr_maxsz + \ encode_restorefh_maxsz + \ - decode_getattr_maxsz) + encode_getattr_maxsz) #define NFS4_dec_link_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ decode_link_maxsz + \ - decode_getattr_maxsz + \ decode_restorefh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ @@ -653,20 +652,14 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_savefh_maxsz + \ encode_create_maxsz + \ encode_getfh_maxsz + \ - encode_getattr_maxsz + \ - encode_restorefh_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_create_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_savefh_maxsz + \ decode_create_maxsz + \ decode_getfh_maxsz + \ - decode_getattr_maxsz + \ - decode_restorefh_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_pathconf_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ @@ -738,6 +731,12 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_secinfo_maxsz) #if defined(CONFIG_NFS_V4_1) +#define NFS4_enc_bind_conn_to_session_sz \ + (compound_encode_hdr_maxsz + \ + encode_bind_conn_to_session_maxsz) +#define NFS4_dec_bind_conn_to_session_sz \ + (compound_decode_hdr_maxsz + \ + decode_bind_conn_to_session_maxsz) #define NFS4_enc_exchange_id_sz \ (compound_encode_hdr_maxsz + \ encode_exchange_id_maxsz) @@ -754,6 +753,10 @@ static int nfs4_stat_to_errno(int); encode_destroy_session_maxsz) #define NFS4_dec_destroy_session_sz (compound_decode_hdr_maxsz + \ decode_destroy_session_maxsz) +#define NFS4_enc_destroy_clientid_sz (compound_encode_hdr_maxsz + \ + encode_destroy_clientid_maxsz) +#define NFS4_dec_destroy_clientid_sz (compound_decode_hdr_maxsz + \ + decode_destroy_clientid_maxsz) #define NFS4_enc_sequence_sz \ (compound_decode_hdr_maxsz + \ encode_sequence_maxsz) @@ -1103,7 +1106,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg encode_nfs4_stateid(xdr, arg->stateid); } -static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_commit(struct xdr_stream *xdr, const 
struct nfs_commitargs *args, struct compound_hdr *hdr) { __be32 *p; @@ -1194,6 +1197,16 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c bitmask[1] & nfs4_fattr_bitmap[1], hdr); } +static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, + struct compound_hdr *hdr) +{ + encode_getattr_three(xdr, + bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], + bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD, + hdr); +} + static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { encode_getattr_three(xdr, @@ -1678,6 +1691,20 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ +static void encode_bind_conn_to_session(struct xdr_stream *xdr, + struct nfs4_session *session, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, + decode_bind_conn_to_session_maxsz, hdr); + encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = xdr_reserve_space(xdr, 8); + *p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH); + *p = 0; /* use_conn_in_rdma_mode = False */ +} + static void encode_exchange_id(struct xdr_stream *xdr, struct nfs41_exchange_id_args *args, struct compound_hdr *hdr) @@ -1726,6 +1753,7 @@ static void encode_create_session(struct xdr_stream *xdr, char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; uint32_t len; struct nfs_client *clp = args->client; + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); u32 max_resp_sz_cached; /* @@ -1767,7 +1795,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + *p++ = (__be32)nn->boot_time.tv_nsec; /* stamp */ p = xdr_encode_opaque(p, machine_name, len); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ @@ -1782,6 +1810,14 @@ static void encode_destroy_session(struct xdr_stream *xdr, encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); } +static void encode_destroy_clientid(struct xdr_stream *xdr, + uint64_t clientid, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DESTROY_CLIENTID, decode_destroy_clientid_maxsz, hdr); + encode_uint64(xdr, clientid); +} + static void encode_reclaim_complete(struct xdr_stream *xdr, struct nfs41_reclaim_complete_args *args, struct compound_hdr *hdr) @@ -2064,7 +2100,6 @@ static void nfs4_xdr_enc_remove(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_remove(xdr, &args->name, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2084,9 +2119,6 @@ static void nfs4_xdr_enc_rename(struct rpc_rqst *req, struct xdr_stream *xdr, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->new_dir, &hdr); encode_rename(xdr, args->old_name, args->new_name, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); - encode_restorefh(xdr, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2106,7 +2138,6 @@ static void nfs4_xdr_enc_link(struct rpc_rqst *req, struct xdr_stream *xdr, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dir_fh, &hdr); encode_link(xdr, args->name, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_restorefh(xdr, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); @@ -2125,12 +2156,9 @@ static void nfs4_xdr_enc_create(struct rpc_rqst *req, 
struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->dir_fh, &hdr); - encode_savefh(xdr, &hdr); encode_create(xdr, args, &hdr); encode_getfh(xdr, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); - encode_restorefh(xdr, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2191,12 +2219,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_savefh(xdr, &hdr); encode_open(xdr, args, &hdr); encode_getfh(xdr, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); - encode_restorefh(xdr, &hdr); - encode_getfattr(xdr, args->dir_bitmask, &hdr); + encode_getfattr_open(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2448,7 +2473,7 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, * a COMMIT request */ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeargs *args) + struct nfs_commitargs *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2458,8 +2483,6 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_commit(xdr, args, &hdr); - if (args->bitmask) - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2602,8 +2625,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_delegreturn(xdr, args->stateid, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); + encode_delegreturn(xdr, args->stateid, &hdr); encode_nops(&hdr); } @@ -2651,6 +2674,22 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, #if defined(CONFIG_NFS_V4_1) /* + * BIND_CONN_TO_SESSION request + */ +static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .minorversion = clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_bind_conn_to_session(xdr, clp->cl_session, &hdr); + encode_nops(&hdr); +} + +/* * EXCHANGE_ID request */ static void nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, @@ -2699,6 +2738,22 @@ static void nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, } /* + * a DESTROY_CLIENTID request + */ +static void nfs4_xdr_enc_destroy_clientid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs_client *clp) +{ + struct compound_hdr hdr = { + .minorversion = clp->cl_mvops->minor_version, + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_destroy_clientid(xdr, clp->cl_clientid, &hdr); + encode_nops(&hdr); +} + +/* * a SEQUENCE request */ static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -4102,7 +4157,7 @@ static int decode_verifier(struct xdr_stream *xdr, void *verifier) return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE); } -static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res) { int status; @@ -4220,6 +4275,110 @@ xdr_error: return status; } +static int decode_threshold_hint(struct xdr_stream *xdr, + uint32_t *bitmap, + uint64_t *res, + uint32_t hint_bit) +{ + __be32 *p; + + *res = 0; + if (likely(bitmap[0] & 
hint_bit)) { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_first_threshold_item4(struct xdr_stream *xdr, + struct nfs4_threshold *res) +{ + __be32 *p, *savep; + uint32_t bitmap[3] = {0,}, attrlen; + int status; + + /* layout type */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + res->l_type = be32_to_cpup(p); + + /* thi_hintset bitmap */ + status = decode_attr_bitmap(xdr, bitmap); + if (status < 0) + goto xdr_error; + + /* thi_hintlist length */ + status = decode_attr_length(xdr, &attrlen, &savep); + if (status < 0) + goto xdr_error; + /* thi_hintlist */ + status = decode_threshold_hint(xdr, bitmap, &res->rd_sz, THRESHOLD_RD); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->wr_sz, THRESHOLD_WR); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->rd_io_sz, + THRESHOLD_RD_IO); + if (status < 0) + goto xdr_error; + status = decode_threshold_hint(xdr, bitmap, &res->wr_io_sz, + THRESHOLD_WR_IO); + if (status < 0) + goto xdr_error; + + status = verify_attr_len(xdr, savep, attrlen); + res->bm = bitmap[0]; + + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", + __func__, res->bm, res->rd_sz, res->wr_sz, res->rd_io_sz, + res->wr_io_sz); +xdr_error: + dprintk("%s ret=%d!\n", __func__, status); + return status; +} + +/* + * Thresholds on pNFS direct I/O vrs MDS I/O + */ +static int decode_attr_mdsthreshold(struct xdr_stream *xdr, + uint32_t *bitmap, + struct nfs4_threshold *res) +{ + __be32 *p; + int status = 0; + uint32_t num; + + if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); + if (num == 0) + return 0; + if (num > 1) + printk(KERN_INFO "%s: Warning: Multiple pNFS layout " + "drivers per filesystem not supported\n", + __func__); + + status = decode_first_threshold_item4(xdr, res); + } + return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, @@ -4326,6 +4485,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; + status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold); + if (status < 0) + goto xdr_error; + xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); return status; @@ -5156,7 +5319,6 @@ static int decode_exchange_id(struct xdr_stream *xdr, uint32_t dummy; char *dummy_str; int status; - struct nfs_client *clp = res->client; uint32_t impl_id_count; status = decode_op_hdr(xdr, OP_EXCHANGE_ID); @@ -5166,36 +5328,39 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - xdr_decode_hyper(p, &clp->cl_clientid); + xdr_decode_hyper(p, &res->clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; - clp->cl_seqid = be32_to_cpup(p++); - clp->cl_exchange_flags = be32_to_cpup(p++); + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ dummy = be32_to_cpup(p); if (dummy != SP4_NONE) return -EIO; - /* Throw 
away minor_id */ + /* server_owner4.so_minor_id */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; + p = xdr_decode_hyper(p, &res->server_owner->minor_id); - /* Throw away Major id */ + /* server_owner4.so_major_id */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(res->server_owner->major_id, dummy_str, dummy); + res->server_owner->major_id_sz = dummy; - /* Save server_scope */ + /* server_scope4 */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; - if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) return -EIO; - memcpy(res->server_scope->server_scope, dummy_str, dummy); res->server_scope->server_scope_sz = dummy; @@ -5276,6 +5441,37 @@ static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); } +static int decode_bind_conn_to_session(struct xdr_stream *xdr, + struct nfs41_bind_conn_to_session_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); + if (!status) + status = decode_sessionid(xdr, &res->session->sess_id); + if (unlikely(status)) + return status; + + /* dir flags, rdma mode bool */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + + res->dir = be32_to_cpup(p++); + if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) + return -EIO; + if (be32_to_cpup(p) == 0) + res->use_conn_in_rdma_mode = false; + else + res->use_conn_in_rdma_mode = true; + + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_create_session(struct xdr_stream *xdr, struct nfs41_create_session_res *res) { @@ -5312,6 +5508,11 @@ static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) return decode_op_hdr(xdr, OP_DESTROY_SESSION); } +static int decode_destroy_clientid(struct xdr_stream *xdr, void *dummy) +{ + return decode_op_hdr(xdr, OP_DESTROY_CLIENTID); +} + static int decode_reclaim_complete(struct xdr_stream *xdr, void *dummy) { return decode_op_hdr(xdr, OP_RECLAIM_COMPLETE); @@ -5800,9 +6001,6 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; status = decode_remove(xdr, &res->cinfo); - if (status) - goto out; - decode_getfattr(xdr, res->dir_attr, res->server); out: return status; } @@ -5832,15 +6030,6 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; status = decode_rename(xdr, &res->old_cinfo, &res->new_cinfo); - if (status) - goto out; - /* Current FH is target directory */ - if (decode_getfattr(xdr, res->new_fattr, res->server)) - goto out; - status = decode_restorefh(xdr); - if (status) - goto out; - decode_getfattr(xdr, res->old_fattr, res->server); out: return status; } @@ -5876,8 +6065,6 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, * Note order: OP_LINK leaves the directory as the current * filehandle. 
*/ - if (decode_getfattr(xdr, res->dir_attr, res->server)) - goto out; status = decode_restorefh(xdr); if (status) goto out; @@ -5904,21 +6091,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_savefh(xdr); - if (status) - goto out; status = decode_create(xdr, &res->dir_cinfo); if (status) goto out; status = decode_getfh(xdr, res->fh); if (status) goto out; - if (decode_getfattr(xdr, res->fattr, res->server)) - goto out; - status = decode_restorefh(xdr); - if (status) - goto out; - decode_getfattr(xdr, res->dir_fattr, res->server); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6075,19 +6254,12 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_savefh(xdr); - if (status) - goto out; status = decode_open(xdr, res); if (status) goto out; if (decode_getfh(xdr, &res->fh) != 0) goto out; - if (decode_getfattr(xdr, res->f_attr, res->server) != 0) - goto out; - if (decode_restorefh(xdr) != 0) - goto out; - decode_getfattr(xdr, res->dir_attr, res->server); + decode_getfattr(xdr, res->f_attr, res->server); out: return status; } @@ -6353,7 +6525,7 @@ out: * Decode COMMIT response */ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_writeres *res) + struct nfs_commitres *res) { struct compound_hdr hdr; int status; @@ -6368,10 +6540,6 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, if (status) goto out; status = decode_commit(xdr, res); - if (status) - goto out; - if (res->fattr) - decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6527,10 +6695,10 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_delegreturn(xdr); + status = decode_getfattr(xdr, res->fattr, res->server); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server); + status = decode_delegreturn(xdr); out: return status; } @@ -6591,6 +6759,22 @@ out: #if defined(CONFIG_NFS_V4_1) /* + * Decode BIND_CONN_TO_SESSION response + */ +static int nfs4_xdr_dec_bind_conn_to_session(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_bind_conn_to_session(xdr, res); + return status; +} + +/* * Decode EXCHANGE_ID response */ static int nfs4_xdr_dec_exchange_id(struct rpc_rqst *rqstp, @@ -6639,6 +6823,22 @@ static int nfs4_xdr_dec_destroy_session(struct rpc_rqst *rqstp, } /* + * Decode DESTROY_CLIENTID response + */ +static int nfs4_xdr_dec_destroy_clientid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (!status) + status = decode_destroy_clientid(xdr, res); + return status; +} + +/* * Decode SEQUENCE response */ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, @@ -7085,6 +7285,9 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), + PROC(BIND_CONN_TO_SESSION, + enc_bind_conn_to_session, dec_bind_conn_to_session), + PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), #endif /* CONFIG_NFS_V4_1 */ }; diff --git 
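
/*
 * Sketch of the COMPOUND decoding discipline behind the nfs4_xdr_dec_*
 * changes above: operations must be decoded in exactly the order they
 * were encoded, which is why dropping GETATTR/SAVEFH/RESTOREFH from an
 * encoder requires dropping the matching decode calls. Hypothetical
 * minimal types, not the kernel's.
 */
#include <stdio.h>

struct stream { int pos; };

typedef int (*decoder_t)(struct stream *);

static int dec_putfh(struct stream *s)  { s->pos++; return 0; }
static int dec_remove(struct stream *s) { s->pos++; return 0; }

int main(void)
{
	decoder_t ops[] = { dec_putfh, dec_remove };	/* encoded order */
	struct stream s = { 0 };
	int status = 0;
	size_t i;

	for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
		status = ops[i](&s);
		if (status)		/* first failure ends the compound */
			break;
	}
	printf("decoded %d ops, status %d\n", s.pos, status);
	return 0;
}
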
a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 4bff4a3..b47277ba 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -211,7 +211,7 @@ static void copy_single_comp(struct ore_components *oc, unsigned c, memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); } -int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, +static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, struct objio_segment **pseg) { /* This is the in memory structure of the objio_segment @@ -440,11 +440,12 @@ static void _read_done(struct ore_io_state *ios, void *private) int objio_read_pagelist(struct nfs_read_data *rdata) { + struct nfs_pgio_header *hdr = rdata->header; struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, - rdata->lseg, rdata->args.pages, rdata->args.pgbase, + ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, + hdr->lseg, rdata->args.pages, rdata->args.pgbase, rdata->args.offset, rdata->args.count, rdata, GFP_KERNEL, &objios); if (unlikely(ret)) @@ -483,12 +484,12 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { struct objio_state *objios = priv; struct nfs_write_data *wdata = objios->oir.rpcdata; + struct address_space *mapping = wdata->header->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; - struct page *page = find_get_page(wdata->inode->i_mapping, index); + struct page *page = find_get_page(mapping, index); if (!page) { - page = find_or_create_page(wdata->inode->i_mapping, - index, GFP_NOFS); + page = find_or_create_page(mapping, index, GFP_NOFS); if (unlikely(!page)) { dprintk("%s: grab_cache_page Failed index=0x%lx\n", __func__, index); @@ -518,11 +519,12 @@ static const struct _ore_r4w_op _r4w_op = { int objio_write_pagelist(struct nfs_write_data *wdata, int how) { + struct nfs_pgio_header *hdr = wdata->header; struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, - wdata->lseg, wdata->args.pages, wdata->args.pgbase, + ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, + hdr->lseg, wdata->args.pages, wdata->args.pgbase, wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, &objios); if (unlikely(ret)) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 595c5fc..8746135 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -258,7 +258,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) if (status >= 0) rdata->res.count = status; else - rdata->pnfs_error = status; + rdata->header->pnfs_error = status; objlayout_iodone(oir); /* must not use oir after this point */ @@ -279,12 +279,14 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) enum pnfs_try_status objlayout_read_pagelist(struct nfs_read_data *rdata) { + struct nfs_pgio_header *hdr = rdata->header; + struct inode *inode = hdr->inode; loff_t offset = rdata->args.offset; size_t count = rdata->args.count; int err; loff_t eof; - eof = i_size_read(rdata->inode); + eof = i_size_read(inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { err = 0; @@ -297,17 +299,17 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) } rdata->res.eof = (offset + count) >= eof; - _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + _fix_verify_io_params(hdr->lseg, &rdata->args.pages, &rdata->args.pgbase, rdata->args.offset, rdata->args.count); dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", - __func__, 
rdata->inode->i_ino, offset, count, rdata->res.eof); + __func__, inode->i_ino, offset, count, rdata->res.eof); err = objio_read_pagelist(rdata); out: if (unlikely(err)) { - rdata->pnfs_error = err; + hdr->pnfs_error = err; dprintk("%s: Returned Error %d\n", __func__, err); return PNFS_NOT_ATTEMPTED; } @@ -340,7 +342,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) wdata->res.count = status; wdata->verf.committed = oir->committed; } else { - wdata->pnfs_error = status; + wdata->header->pnfs_error = status; } objlayout_iodone(oir); /* must not use oir after this point */ @@ -363,15 +365,16 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { + struct nfs_pgio_header *hdr = wdata->header; int err; - _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + _fix_verify_io_params(hdr->lseg, &wdata->args.pages, &wdata->args.pgbase, wdata->args.offset, wdata->args.count); err = objio_write_pagelist(wdata, how); if (unlikely(err)) { - wdata->pnfs_error = err; + hdr->pnfs_error = err; dprintk("%s: Returned Error %d\n", __func__, err); return PNFS_NOT_ATTEMPTED; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d21fcea..aed913c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -26,6 +26,47 @@ static struct kmem_cache *nfs_page_cachep; +bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +{ + p->npages = pagecount; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; + else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + if (!p->pagevec) + p->npages = 0; + } + return p->pagevec != NULL; +} + +void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, + void (*release)(struct nfs_pgio_header *hdr)) +{ + hdr->req = nfs_list_entry(desc->pg_list.next); + hdr->inode = desc->pg_inode; + hdr->cred = hdr->req->wb_context->cred; + hdr->io_start = req_offset(hdr->req); + hdr->good_bytes = desc->pg_count; + hdr->dreq = desc->pg_dreq; + hdr->release = release; + hdr->completion_ops = desc->pg_completion_ops; + if (hdr->completion_ops->init_hdr) + hdr->completion_ops->init_hdr(hdr); +} + +void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) +{ + spin_lock(&hdr->lock); + if (pos < hdr->io_start + hdr->good_bytes) { + set_bit(NFS_IOHDR_ERROR, &hdr->flags); + clear_bit(NFS_IOHDR_EOF, &hdr->flags); + hdr->good_bytes = pos - hdr->io_start; + hdr->error = error; + } + spin_unlock(&hdr->lock); +} + static inline struct nfs_page * nfs_page_alloc(void) { @@ -76,12 +117,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. 
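
/*
 * Userspace analogue of nfs_pgarray_set() above: small page vectors use
 * an array embedded in the structure, larger ones fall back to a heap
 * allocation (kcalloc in the kernel). Stand-in types; INLINE_PAGES is
 * an arbitrary substitute for ARRAY_SIZE(p->page_array).
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define INLINE_PAGES 8

struct page;				/* opaque here */

struct page_array {
	struct page **pagevec;
	unsigned int npages;
	struct page *inline_pages[INLINE_PAGES];
};

static bool pgarray_set(struct page_array *p, unsigned int pagecount)
{
	p->npages = pagecount;
	if (pagecount <= INLINE_PAGES)
		p->pagevec = p->inline_pages;
	else {
		p->pagevec = calloc(pagecount, sizeof(struct page *));
		if (!p->pagevec)
			p->npages = 0;	/* allocation failed */
	}
	return p->pagevec != NULL;
}

int main(void)
{
	struct page_array pa;

	printf("small: %d\n", pgarray_set(&pa, 4));	/* embedded array */
	printf("large: %d\n", pgarray_set(&pa, 64));	/* heap fallback */
	if (pa.pagevec != pa.inline_pages)
		free(pa.pagevec);
	return 0;
}
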
*/ req->wb_page = page; - atomic_set(&req->wb_complete, 0); req->wb_index = page->index; page_cache_get(page); - BUG_ON(PagePrivate(page)); - BUG_ON(!PageLocked(page)); - BUG_ON(page->mapping->host != inode); req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; @@ -104,6 +141,15 @@ void nfs_unlock_request(struct nfs_page *req) clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_clear_bit(); wake_up_bit(&req->wb_flags, PG_BUSY); +} + +/** + * nfs_unlock_and_release_request - Unlock request and release the nfs_page + * @req: + */ +void nfs_unlock_and_release_request(struct nfs_page *req) +{ + nfs_unlock_request(req); nfs_release_request(req); } @@ -203,6 +249,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test); void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, + const struct nfs_pgio_completion_ops *compl_ops, size_t bsize, int io_flags) { @@ -215,9 +262,11 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_recoalesce = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; + desc->pg_completion_ops = compl_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; + desc->pg_dreq = NULL; } /** @@ -241,12 +290,12 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (req->wb_context->state != prev->wb_context->state) return false; - if (req->wb_index != (prev->wb_index + 1)) - return false; if (req->wb_pgbase != 0) return false; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return false; + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; return pgio->pg_ops->pg_test(pgio, prev, req); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 38512bc..b8323aa 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -395,6 +395,9 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) { + /* Reset MDS Threshold I/O counters */ + NFS_I(lo->plh_inode)->write_io = 0; + NFS_I(lo->plh_inode)->read_io = 0; if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) put_layout_hdr_locked(lo); return 0; @@ -455,6 +458,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); } +EXPORT_SYMBOL_GPL(pnfs_destroy_layout); /* * Called by the state manger to remove all layouts established under an @@ -692,6 +696,7 @@ out: dprintk("<-- %s status: %d\n", __func__, status); return status; } +EXPORT_SYMBOL_GPL(_pnfs_return_layout); bool pnfs_roc(struct inode *ino) { @@ -931,6 +936,81 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, } /* + * Use mdsthreshold hints set at each OPEN to determine if I/O should go + * to the MDS or over pNFS + * + * The nfs_inode read_io and write_io fields are cumulative counters reset + * when there are no layout segments. Note that in pnfs_update_layout iomode + * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a + * WRITE request. + * + * A return of true means use MDS I/O. + * + * From rfc 5661: + * If a file's size is smaller than the file size threshold, data accesses + * SHOULD be sent to the metadata server. If an I/O request has a length that + * is below the I/O size threshold, the I/O SHOULD be sent to the metadata + * server. If both file size and I/O size are provided, the client SHOULD + * reach or exceed both thresholds before sending its read or write + * requests to the data server. 
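
/*
 * Sketch of the RFC 5661 rule quoted above as pnfs_within_mdsthreshold()
 * applies it: with both a file-size and an I/O-size hint present, both
 * must indicate "small" before I/O is steered to the MDS; with only one
 * hint present, that hint decides alone. Plain booleans stand in for
 * the kernel's bitmap tests.
 */
#include <stdbool.h>
#include <stdio.h>

static bool use_mds(bool size_set, bool size_small,
		    bool io_set, bool io_small)
{
	if (size_set && io_set)			/* SHOULD reach both */
		return size_small && io_small;
	return (size_set && size_small) || (io_set && io_small);
}

int main(void)
{
	/* both hints set, only the size hint is under threshold: stay on pNFS */
	printf("%d\n", use_mds(true, true, true, false));
	/* only the size hint set, and under threshold: go to the MDS */
	printf("%d\n", use_mds(true, true, false, false));
	return 0;
}
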
+ */ +static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, + struct inode *ino, int iomode) +{ + struct nfs4_threshold *t = ctx->mdsthreshold; + struct nfs_inode *nfsi = NFS_I(ino); + loff_t fsize = i_size_read(ino); + bool size = false, size_set = false, io = false, io_set = false, ret = false; + + if (t == NULL) + return ret; + + dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", + __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz); + + switch (iomode) { + case IOMODE_READ: + if (t->bm & THRESHOLD_RD) { + dprintk("%s fsize %llu\n", __func__, fsize); + size_set = true; + if (fsize < t->rd_sz) + size = true; + } + if (t->bm & THRESHOLD_RD_IO) { + dprintk("%s nfsi->read_io %llu\n", __func__, + nfsi->read_io); + io_set = true; + if (nfsi->read_io < t->rd_io_sz) + io = true; + } + break; + case IOMODE_RW: + if (t->bm & THRESHOLD_WR) { + dprintk("%s fsize %llu\n", __func__, fsize); + size_set = true; + if (fsize < t->wr_sz) + size = true; + } + if (t->bm & THRESHOLD_WR_IO) { + dprintk("%s nfsi->write_io %llu\n", __func__, + nfsi->write_io); + io_set = true; + if (nfsi->write_io < t->wr_io_sz) + io = true; + } + break; + } + if (size_set && io_set) { + if (size && io) + ret = true; + } else if (size || io) + ret = true; + + dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret); + return ret; +} + +/* * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. */ @@ -957,6 +1037,10 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; + + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + return NULL; + spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { @@ -1082,6 +1166,10 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r { BUG_ON(pgio->pg_lseg != NULL); + if (req->wb_offset != req->wb_pgbase) { + nfs_pageio_reset_read_mds(pgio); + return; + } pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), @@ -1100,6 +1188,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * { BUG_ON(pgio->pg_lseg != NULL); + if (req->wb_offset != req->wb_pgbase) { + nfs_pageio_reset_write_mds(pgio); + return; + } pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), @@ -1113,26 +1205,31 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page * EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); bool -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, + const struct nfs_pgio_completion_ops *compl_ops) { struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; if (ld == NULL) return false; - nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0); + nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, + server->rsize, 0); return true; } bool -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, + int ioflags, + const struct nfs_pgio_completion_ops *compl_ops) { struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; if (ld == NULL) return false; - nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags); + 
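
/*
 * Sketch of the init-with-fallback shape of pnfs_pageio_init_read()/
 * nfs_pageio_init_read() above: when no layout driver is configured the
 * pNFS initializer returns false and the caller initializes the
 * descriptor for the MDS path instead. Hypothetical minimal types.
 */
#include <stdbool.h>
#include <stdio.h>

struct ops { const char *name; };

static const struct ops mds_ops  = { "mds"  };
static const struct ops pnfs_ops = { "pnfs" };

struct desc { const struct ops *ops; };

static bool pnfs_init(struct desc *d, bool have_layout_driver)
{
	if (!have_layout_driver)
		return false;	/* caller falls back to the MDS path */
	d->ops = &pnfs_ops;
	return true;
}

static void init_read(struct desc *d, bool have_layout_driver)
{
	if (!pnfs_init(d, have_layout_driver))
		d->ops = &mds_ops;
}

int main(void)
{
	struct desc d;

	init_read(&d, false);
	printf("using %s ops\n", d.ops->name);	/* "mds" */
	return 0;
}
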
nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, + server->wsize, ioflags); return true; } @@ -1162,13 +1259,15 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); -static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head) +int pnfs_write_done_resend_to_mds(struct inode *inode, + struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops) { struct nfs_pageio_descriptor pgio; LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE); + nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE, compl_ops); while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1188,30 +1287,37 @@ static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head * } return 0; } +EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); + +static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + dprintk("pnfs write error = %d\n", hdr->pnfs_error); + if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); + pnfs_return_layout(hdr->inode); + } + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) + data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops); +} /* * Called by non rpc-based layout drivers */ void pnfs_ld_write_done(struct nfs_write_data *data) { - if (likely(!data->pnfs_error)) { + struct nfs_pgio_header *hdr = data->header; + + if (!hdr->pnfs_error) { pnfs_set_layoutcommit(data); - data->mds_ops->rpc_call_done(&data->task, data); - } else { - dprintk("pnfs write error = %d\n", data->pnfs_error); - if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & - PNFS_LAYOUTRET_ON_ERROR) { - /* Don't lo_commit on error, Server will needs to - * preform a file recovery. 
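
/*
 * Sketch of the redo-once guard in pnfs_ld_handle_write_error() above:
 * test_and_set_bit(NFS_IOHDR_REDO, ...) ensures the failed pages are
 * requeued to the MDS at most once per I/O header, no matter how many
 * component RPCs report an error. Modeled here with C11 atomics.
 */
#include <stdatomic.h>
#include <stdio.h>

struct hdr { atomic_flag redo; };

static void handle_error(struct hdr *h, int err)
{
	/* test_and_set returns the old value: only the first caller resends */
	if (!atomic_flag_test_and_set(&h->redo))
		printf("requeueing pages to the MDS after error %d\n", err);
	else
		printf("resend already in flight\n");
}

int main(void)
{
	struct hdr h = { ATOMIC_FLAG_INIT };

	handle_error(&h, -5);	/* first error triggers the resend */
	handle_error(&h, -5);	/* later errors are absorbed */
	return 0;
}
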
- */ - clear_bit(NFS_INO_LAYOUTCOMMIT, - &NFS_I(data->inode)->flags); - pnfs_return_layout(data->inode); - } - data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages); - } - put_lseg(data->lseg); - data->mds_ops->rpc_release(data); + hdr->mds_ops->rpc_call_done(&data->task, data); + } else + pnfs_ld_handle_write_error(data); + hdr->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); @@ -1219,12 +1325,13 @@ static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, struct nfs_write_data *data) { - list_splice_tail_init(&data->pages, &desc->pg_list); - if (data->req && list_empty(&data->req->wb_list)) - nfs_list_add_request(data->req, &desc->pg_list); - nfs_pageio_reset_write_mds(desc); - desc->pg_recoalesce = 1; - put_lseg(data->lseg); + struct nfs_pgio_header *hdr = data->header; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + list_splice_tail_init(&hdr->pages, &desc->pg_list); + nfs_pageio_reset_write_mds(desc); + desc->pg_recoalesce = 1; + } nfs_writedata_release(data); } @@ -1234,23 +1341,18 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, struct pnfs_layout_segment *lseg, int how) { - struct inode *inode = wdata->inode; + struct nfs_pgio_header *hdr = wdata->header; + struct inode *inode = hdr->inode; enum pnfs_try_status trypnfs; struct nfs_server *nfss = NFS_SERVER(inode); - wdata->mds_ops = call_ops; - wdata->lseg = get_lseg(lseg); + hdr->mds_ops = call_ops; dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, inode->i_ino, wdata->args.count, wdata->args.offset, how); - trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) { - put_lseg(wdata->lseg); - wdata->lseg = NULL; - } else + if (trypnfs != PNFS_NOT_ATTEMPTED) nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); - dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } @@ -1266,7 +1368,7 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he while (!list_empty(head)) { enum pnfs_try_status trypnfs; - data = list_entry(head->next, struct nfs_write_data, list); + data = list_first_entry(head, struct nfs_write_data, list); list_del_init(&data->list); trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); @@ -1276,43 +1378,82 @@ pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *he put_lseg(lseg); } +static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) +{ + put_lseg(hdr->lseg); + nfs_writehdr_free(hdr); +} + int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - LIST_HEAD(head); + struct nfs_write_header *whdr; + struct nfs_pgio_header *hdr; int ret; - ret = nfs_generic_flush(desc, &head); - if (ret != 0) { + whdr = nfs_writehdr_alloc(); + if (!whdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; - return ret; + return -ENOMEM; } - pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags); - return 0; + hdr = &whdr->header; + nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); + hdr->lseg = get_lseg(desc->pg_lseg); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_flush(desc, hdr); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } else + pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +int pnfs_read_done_resend_to_mds(struct 
inode *inode, + struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops) { struct nfs_pageio_descriptor pgio; + LIST_HEAD(failed); - put_lseg(data->lseg); - data->lseg = NULL; - dprintk("pnfs write error = %d\n", data->pnfs_error); - if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags & - PNFS_LAYOUTRET_ON_ERROR) - pnfs_return_layout(data->inode); - - nfs_pageio_init_read_mds(&pgio, data->inode); - - while (!list_empty(&data->pages)) { - struct nfs_page *req = nfs_list_entry(data->pages.next); + /* Resend all requests through the MDS */ + nfs_pageio_init_read_mds(&pgio, inode, compl_ops); + while (!list_empty(head)) { + struct nfs_page *req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_pageio_add_request(&pgio, req); + if (!nfs_pageio_add_request(&pgio, req)) + nfs_list_add_request(req, &failed); } nfs_pageio_complete(&pgio); + + if (!list_empty(&failed)) { + list_move(&failed, head); + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); + +static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + + dprintk("pnfs read error = %d\n", hdr->pnfs_error); + if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & + PNFS_LAYOUTRET_ON_ERROR) { + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); + pnfs_return_layout(hdr->inode); + } + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) + data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, + &hdr->pages, + hdr->completion_ops); } /* @@ -1320,13 +1461,14 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) */ void pnfs_ld_read_done(struct nfs_read_data *data) { - if (likely(!data->pnfs_error)) { + struct nfs_pgio_header *hdr = data->header; + + if (likely(!hdr->pnfs_error)) { __nfs4_read_done_cb(data); - data->mds_ops->rpc_call_done(&data->task, data); + hdr->mds_ops->rpc_call_done(&data->task, data); } else pnfs_ld_handle_read_error(data); - put_lseg(data->lseg); - data->mds_ops->rpc_release(data); + hdr->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); @@ -1334,11 +1476,13 @@ static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, struct nfs_read_data *data) { - list_splice_tail_init(&data->pages, &desc->pg_list); - if (data->req && list_empty(&data->req->wb_list)) - nfs_list_add_request(data->req, &desc->pg_list); - nfs_pageio_reset_read_mds(desc); - desc->pg_recoalesce = 1; + struct nfs_pgio_header *hdr = data->header; + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + list_splice_tail_init(&hdr->pages, &desc->pg_list); + nfs_pageio_reset_read_mds(desc); + desc->pg_recoalesce = 1; + } nfs_readdata_release(data); } @@ -1350,23 +1494,19 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { - struct inode *inode = rdata->inode; + struct nfs_pgio_header *hdr = rdata->header; + struct inode *inode = hdr->inode; struct nfs_server *nfss = NFS_SERVER(inode); enum pnfs_try_status trypnfs; - rdata->mds_ops = call_ops; - rdata->lseg = get_lseg(lseg); + hdr->mds_ops = call_ops; dprintk("%s: Reading ino:%lu %u@%llu\n", __func__, inode->i_ino, rdata->args.count, rdata->args.offset); trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); - if (trypnfs == PNFS_NOT_ATTEMPTED) { - put_lseg(rdata->lseg); - rdata->lseg = NULL; - } else { + if (trypnfs != PNFS_NOT_ATTEMPTED) nfs_inc_stats(inode, NFSIOS_PNFS_READ); - } dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } @@ -1382,7 
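
/*
 * Sketch of the resend loop in pnfs_read_done_resend_to_mds() above:
 * each request is pushed through the MDS path, requests that cannot be
 * queued are parked on a local "failed" list, and -EIO hands them back
 * to the caller. Arrays stand in for the kernel's nfs_page lists.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool queue_to_mds(int req)
{
	return req != 3;	/* pretend request 3 cannot be queued */
}

static int resend_to_mds(const int *reqs, int n, int *failed, int *nfailed)
{
	int i;

	*nfailed = 0;
	for (i = 0; i < n; i++)
		if (!queue_to_mds(reqs[i]))
			failed[(*nfailed)++] = reqs[i];
	return *nfailed ? -EIO : 0;	/* failures go back to the caller */
}

int main(void)
{
	int reqs[] = { 1, 2, 3 }, failed[3], nfailed;
	int status = resend_to_mds(reqs, 3, failed, &nfailed);

	printf("status %d, %d request(s) failed again\n", status, nfailed);
	return 0;
}
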
+1522,7 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea while (!list_empty(head)) { enum pnfs_try_status trypnfs; - data = list_entry(head->next, struct nfs_read_data, list); + data = list_first_entry(head, struct nfs_read_data, list); list_del_init(&data->list); trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); @@ -1392,20 +1532,40 @@ pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *hea put_lseg(lseg); } +static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) +{ + put_lseg(hdr->lseg); + nfs_readhdr_free(hdr); +} + int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - LIST_HEAD(head); + struct nfs_read_header *rhdr; + struct nfs_pgio_header *hdr; int ret; - ret = nfs_generic_pagein(desc, &head); - if (ret != 0) { + rhdr = nfs_readhdr_alloc(); + if (!rhdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + ret = -ENOMEM; put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; return ret; } - pnfs_do_multiple_reads(desc, &head); - return 0; + hdr = &rhdr->header; + nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); + hdr->lseg = get_lseg(desc->pg_lseg); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pagein(desc, hdr); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } else + pnfs_do_multiple_reads(desc, &hdr->rpc_list); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); @@ -1438,30 +1598,32 @@ EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void pnfs_set_layoutcommit(struct nfs_write_data *wdata) { - struct nfs_inode *nfsi = NFS_I(wdata->inode); + struct nfs_pgio_header *hdr = wdata->header; + struct inode *inode = hdr->inode; + struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos = wdata->mds_offset + wdata->res.count; bool mark_as_dirty = false; - spin_lock(&nfsi->vfs_inode.i_lock); + spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", - __func__, wdata->inode->i_ino); + __func__, inode->i_ino); } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - get_lseg(wdata->lseg); + get_lseg(hdr->lseg); } if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&nfsi->vfs_inode.i_lock); + spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, wdata->lseg, nfsi->layout->plh_lwb); + __func__, hdr->lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ if (mark_as_dirty) - mark_inode_dirty_sync(wdata->inode); + mark_inode_dirty_sync(inode); } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); @@ -1550,3 +1712,15 @@ out_free: kfree(data); goto out; } + +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ + struct nfs4_threshold *thp; + + thp = kzalloc(sizeof(*thp), GFP_NOFS); + if (!thp) { + dprintk("%s mdsthreshold allocation failed\n", __func__); + return NULL; + } + return thp; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 442ebf6..29fd23c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -63,6 +63,7 @@ enum { NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ + NFS_LAYOUT_INVALID, /* 
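
/*
 * Sketch of the reference-counted completion scheme used by
 * pnfs_generic_pg_readpages()/writepages() above: setup holds one
 * reference, every issued sub-request holds another, and whoever drops
 * the count to zero runs the completion callback, so the header
 * outlives stragglers. Simplified model, locking omitted.
 */
#include <stdatomic.h>
#include <stdio.h>

struct hdr {
	atomic_int refcnt;
	void (*completion)(struct hdr *);
};

static void hdr_get(struct hdr *h)
{
	atomic_fetch_add(&h->refcnt, 1);
}

static void hdr_put(struct hdr *h)
{
	if (atomic_fetch_sub(&h->refcnt, 1) == 1)	/* dropped last ref */
		h->completion(h);
}

static void done(struct hdr *h)
{
	(void)h;
	printf("all sub-requests finished\n");
}

int main(void)
{
	struct hdr h = { .completion = done };
	int i;

	hdr_get(&h);		/* reference held across setup */
	for (i = 0; i < 3; i++)
		hdr_get(&h);	/* one per issued sub-request */
	for (i = 0; i < 3; i++)
		hdr_put(&h);	/* sub-requests complete */
	hdr_put(&h);		/* setup's put runs done() */
	return 0;
}
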
layout is being destroyed */ }; enum layoutdriver_policy_flags { @@ -94,11 +95,20 @@ struct pnfs_layoutdriver_type { const struct nfs_pageio_ops *pg_read_ops; const struct nfs_pageio_ops *pg_write_ops; + struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); void (*mark_request_commit) (struct nfs_page *req, - struct pnfs_layout_segment *lseg); - void (*clear_request_commit) (struct nfs_page *req); - int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock); - int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted @@ -168,8 +178,10 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); -bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); -bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); +bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, + const struct nfs_pgio_completion_ops *); +bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, + int, const struct nfs_pgio_completion_ops *); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); @@ -211,6 +223,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, gfp_t gfp_flags); void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); +int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops); +int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head, + const struct nfs_pgio_completion_ops *compl_ops); +struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); /* nfs4_deviceid_flags */ enum { @@ -261,49 +278,66 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss) } static inline int -pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, + struct nfs_commit_info *cinfo) { - if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags)) + if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) return PNFS_NOT_ATTEMPTED; - return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); +} + +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld == NULL || ld->get_ds_info == NULL) + return NULL; + return ld->get_ds_info(inode); } static inline bool -pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct inode *inode = req->wb_context->dentry->d_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (lseg == NULL 
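
/*
 * Sketch of the optional-method dispatch used by the pnfs.h inline
 * helpers above: a layout driver may leave hooks such as get_ds_info
 * NULL, so callers check both the driver and the method pointer before
 * dispatching. Hypothetical minimal types.
 */
#include <stdio.h>

struct ds_info { int ncommitting; };

struct layoutdriver {
	struct ds_info *(*get_ds_info)(void);
};

static struct ds_info *pnfs_ds_info(const struct layoutdriver *ld)
{
	if (ld == NULL || ld->get_ds_info == NULL)
		return NULL;	/* driver does not track DS commits */
	return ld->get_ds_info();
}

int main(void)
{
	const struct layoutdriver no_hook = { .get_ds_info = NULL };

	printf("%p\n", (void *)pnfs_ds_info(NULL));	/* no driver */
	printf("%p\n", (void *)pnfs_ds_info(&no_hook));	/* hook unset */
	return 0;
}
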
|| ld->mark_request_commit == NULL) return false; - ld->mark_request_commit(req, lseg); + ld->mark_request_commit(req, lseg, cinfo); return true; } static inline bool -pnfs_clear_request_commit(struct nfs_page *req) +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { struct inode *inode = req->wb_context->dentry->d_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (ld == NULL || ld->clear_request_commit == NULL) return false; - ld->clear_request_commit(req); + ld->clear_request_commit(req, cinfo); return true; } static inline int -pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock) +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, + int max) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - int ret; - - if (ld == NULL || ld->scan_commit_lists == NULL) + if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) return 0; - ret = ld->scan_commit_lists(inode, max, lock); - if (ret != 0) - set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); - return ret; + else + return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); +} + +static inline void +pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, + struct nfs_commit_info *cinfo) +{ + if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + return; + NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -327,6 +361,14 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, + struct nfs_server *nfss) +{ + return (dst && src && src->bm != 0 && + nfss->pnfs_curr_ld->id == src->l_type); +} + #ifdef NFS_DEBUG void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); #else @@ -396,45 +438,74 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) +static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, + const struct nfs_pgio_completion_ops *compl_ops) { return false; } -static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, + const struct nfs_pgio_completion_ops *compl_ops) { return false; } static inline int -pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, + struct nfs_commit_info *cinfo) { return PNFS_NOT_ATTEMPTED; } +static inline struct pnfs_ds_commit_info * +pnfs_get_ds_info(struct inode *inode) +{ + return NULL; +} + static inline bool -pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { return false; } static inline bool -pnfs_clear_request_commit(struct nfs_page *req) +pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { return false; } static inline int -pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock) +pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, + int max) { return 0; } +static inline void +pnfs_recover_commit_reqs(struct inode *inode, struct 
list_head *list, + struct nfs_commit_info *cinfo) +{ +} + static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { return 0; } +static inline bool +pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, + struct nfs_server *nfss) +{ + return false; +} + +static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) +{ + return NULL; +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b63b6f4..a706b6b 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -178,7 +178,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, +nfs_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { @@ -335,8 +335,7 @@ nfs_proc_remove(struct inode *dir, struct qstr *name) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name.len = name->len, - .name.name = name->name, + .name = *name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -641,12 +640,14 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { + struct inode *inode = data->header->inode; + if (nfs_async_handle_expired_key(task)) return -EAGAIN; - nfs_invalidate_atime(data->inode); + nfs_invalidate_atime(inode); if (task->tk_status >= 0) { - nfs_refresh_inode(data->inode, data->res.fattr); + nfs_refresh_inode(inode, data->res.fattr); /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ @@ -668,11 +669,13 @@ static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { + struct inode *inode = data->header->inode; + if (nfs_async_handle_expired_key(task)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); return 0; } @@ -688,8 +691,13 @@ static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_d rpc_call_start(task); } +static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) +{ + BUG(); +} + static void -nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) +nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) { BUG(); } @@ -733,6 +741,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .file_inode_ops = &nfs_file_inode_operations, .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, + .submount = nfs_submount, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, .lookup = nfs_proc_lookup, @@ -764,6 +773,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, + .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0a4be28..86ced78 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -30,43 +30,73 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE static const struct nfs_pageio_ops nfs_pageio_read_ops; -static const struct rpc_call_ops nfs_read_partial_ops; -static const struct rpc_call_ops 
nfs_read_full_ops; +static const struct rpc_call_ops nfs_read_common_ops; +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; static struct kmem_cache *nfs_rdata_cachep; -struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) +struct nfs_read_header *nfs_readhdr_alloc(void) { - struct nfs_read_data *p; - - p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); - if (p) { - INIT_LIST_HEAD(&p->pages); - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); - if (!p->pagevec) { - kmem_cache_free(nfs_rdata_cachep, p); - p = NULL; - } - } + struct nfs_read_header *rhdr; + + rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + if (rhdr) { + struct nfs_pgio_header *hdr = &rhdr->header; + + INIT_LIST_HEAD(&hdr->pages); + INIT_LIST_HEAD(&hdr->rpc_list); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); + } + return rhdr; +} + +static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, + unsigned int pagecount) +{ + struct nfs_read_data *data, *prealloc; + + prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; + if (prealloc->header == NULL) + data = prealloc; + else + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + if (nfs_pgarray_set(&data->pages, pagecount)) { + data->header = hdr; + atomic_inc(&hdr->refcnt); + } else { + if (data != prealloc) + kfree(data); + data = NULL; } - return p; +out: + return data; } -void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readhdr_free(struct nfs_pgio_header *hdr) { - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); - kmem_cache_free(nfs_rdata_cachep, p); + struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header); + + kmem_cache_free(nfs_rdata_cachep, rhdr); } void nfs_readdata_release(struct nfs_read_data *rdata) { + struct nfs_pgio_header *hdr = rdata->header; + struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); + put_nfs_open_context(rdata->args.context); - nfs_readdata_free(rdata); + if (rdata->pages.pagevec != rdata->pages.page_array) + kfree(rdata->pages.pagevec); + if (rdata != &read_header->rpc_data) + kfree(rdata); + else + rdata->header = NULL; + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); } static @@ -78,39 +108,11 @@ int nfs_return_empty_page(struct page *page) return 0; } -static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) -{ - unsigned int remainder = data->args.count - data->res.count; - unsigned int base = data->args.pgbase + data->res.count; - unsigned int pglen; - struct page **pages; - - if (data->res.eof == 0 || remainder == 0) - return; - /* - * Note: "remainder" can never be negative, since we check for - * this in the XDR code. 
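
/*
 * Userspace analogue of the nfs_readdata_alloc()/nfs_readdata_release()
 * pairing above: the first nfs_read_data for a header comes from storage
 * embedded in the header (rpc_data) and only extra ones are heap
 * allocated; release frees heap copies and merely clears the embedded
 * slot. Stand-in types.
 */
#include <stdlib.h>

struct header;

struct rpc_data {
	struct header *owner;	/* NULL while the embedded slot is free */
};

struct header {
	struct rpc_data prealloc;
};

static struct rpc_data *data_alloc(struct header *h)
{
	struct rpc_data *d;

	if (h->prealloc.owner == NULL)
		d = &h->prealloc;	/* common case: no allocation */
	else
		d = calloc(1, sizeof(*d));
	if (d)
		d->owner = h;
	return d;
}

static void data_free(struct header *h, struct rpc_data *d)
{
	if (d == &h->prealloc)
		d->owner = NULL;	/* just mark the embedded slot free */
	else
		free(d);
}

int main(void)
{
	struct header h = { { NULL } };
	struct rpc_data *a = data_alloc(&h);	/* embedded */
	struct rpc_data *b = data_alloc(&h);	/* heap */

	data_free(&h, b);
	data_free(&h, a);
	return 0;
}
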
- */ - pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; - base &= ~PAGE_CACHE_MASK; - pglen = PAGE_CACHE_SIZE - base; - for (;;) { - if (remainder <= pglen) { - zero_user(*pages, base, remainder); - break; - } - zero_user(*pages, base, pglen); - pages++; - remainder -= pglen; - pglen = PAGE_CACHE_SIZE; - base = 0; - } -} - void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, - struct inode *inode) + struct inode *inode, + const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, + nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, NFS_SERVER(inode)->rsize, 0); } @@ -121,11 +123,12 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); -static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode, + const struct nfs_pgio_completion_ops *compl_ops) { - if (!pnfs_pageio_init_read(pgio, inode)) - nfs_pageio_init_read_mds(pgio, inode); + if (!pnfs_pageio_init_read(pgio, inode, compl_ops)) + nfs_pageio_init_read_mds(pgio, inode, compl_ops); } int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, @@ -146,9 +149,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_pageio_init_read(&pgio, inode); + nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); + NFS_I(inode)->read_io += pgio.pg_bytes_written; return 0; } @@ -169,16 +173,49 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } -int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, - const struct rpc_call_ops *call_ops) +/* Note io was page aligned */ +static void nfs_read_completion(struct nfs_pgio_header *hdr) +{ + unsigned long bytes = 0; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out; + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct page *page = req->wb_page; + + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { + if (bytes > hdr->good_bytes) + zero_user(page, 0, PAGE_SIZE); + else if (hdr->good_bytes - bytes < PAGE_SIZE) + zero_user_segment(page, + hdr->good_bytes & ~PAGE_MASK, + PAGE_SIZE); + } + bytes += req->wb_bytes; + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + if (bytes <= hdr->good_bytes) + SetPageUptodate(page); + } else + SetPageUptodate(page); + nfs_list_remove_request(req); + nfs_readpage_release(req); + } +out: + hdr->release(hdr); +} + +int nfs_initiate_read(struct rpc_clnt *clnt, + struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, int flags) { - struct inode *inode = data->inode; + struct inode *inode = data->header->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = data->cred, + .rpc_cred = data->header->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, @@ -187,7 +224,7 @@ int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | swap_flags, + .flags = RPC_TASK_ASYNC | swap_flags | flags, }; /* Set up the initial task struct. 
*/ @@ -212,19 +249,15 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read); /* * Set up the NFS read request struct */ -static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +static void nfs_read_rpcsetup(struct nfs_read_data *data, unsigned int count, unsigned int offset) { - struct inode *inode = req->wb_context->dentry->d_inode; - - data->req = req; - data->inode = inode; - data->cred = req->wb_context->cred; + struct nfs_page *req = data->header->req; - data->args.fh = NFS_FH(inode); + data->args.fh = NFS_FH(data->header->inode); data->args.offset = req_offset(req) + offset; data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pagevec; + data->args.pages = data->pages.pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; @@ -238,9 +271,9 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, static int nfs_do_read(struct nfs_read_data *data, const struct rpc_call_ops *call_ops) { - struct inode *inode = data->args.context->dentry->d_inode; + struct inode *inode = data->header->inode; - return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0); } static int @@ -253,7 +286,7 @@ nfs_do_multiple_reads(struct list_head *head, while (!list_empty(head)) { int ret2; - data = list_entry(head->next, struct nfs_read_data, list); + data = list_first_entry(head, struct nfs_read_data, list); list_del_init(&data->list); ret2 = nfs_do_read(data, call_ops); @@ -275,6 +308,24 @@ nfs_async_read_error(struct list_head *head) } } +static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { + .error_cleanup = nfs_async_read_error, + .completion = nfs_read_completion, +}; + +static void nfs_pagein_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + set_bit(NFS_IOHDR_REDO, &hdr->flags); + while (!list_empty(&hdr->rpc_list)) { + struct nfs_read_data *data = list_first_entry(&hdr->rpc_list, + struct nfs_read_data, list); + list_del(&data->list); + nfs_readdata_release(data); + } + desc->pg_completion_ops->error_cleanup(&desc->pg_list); +} + /* * Generate multiple requests to fill a single page. * @@ -288,93 +339,95 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. 
*/ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { - struct nfs_page *req = nfs_list_entry(desc->pg_list.next); + struct nfs_page *req = hdr->req; struct page *page = req->wb_page; struct nfs_read_data *data; size_t rsize = desc->pg_bsize, nbytes; unsigned int offset; - int requests = 0; - int ret = 0; - - nfs_list_remove_request(req); offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); - data = nfs_readdata_alloc(1); - if (!data) - goto out_bad; - data->pagevec[0] = page; - nfs_read_rpcsetup(req, data, len, offset); - list_add(&data->list, res); - requests++; + data = nfs_readdata_alloc(hdr, 1); + if (!data) { + nfs_pagein_error(desc, hdr); + return -ENOMEM; + } + data->pages.pagevec[0] = page; + nfs_read_rpcsetup(data, len, offset); + list_add(&data->list, &hdr->rpc_list); nbytes -= len; offset += len; - } while(nbytes != 0); - atomic_set(&req->wb_complete, requests); - desc->pg_rpc_callops = &nfs_read_partial_ops; - return ret; -out_bad: - while (!list_empty(res)) { - data = list_entry(res->next, struct nfs_read_data, list); - list_del(&data->list); - nfs_readdata_release(data); - } - nfs_readpage_release(req); - return -ENOMEM; + } while (nbytes != 0); + + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + desc->pg_rpc_callops = &nfs_read_common_ops; + return 0; } -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { struct nfs_page *req; struct page **pages; - struct nfs_read_data *data; + struct nfs_read_data *data; struct list_head *head = &desc->pg_list; - int ret = 0; - data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, - desc->pg_count)); + data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base, + desc->pg_count)); if (!data) { - nfs_async_read_error(head); - ret = -ENOMEM; - goto out; + nfs_pagein_error(desc, hdr); + return -ENOMEM; } - pages = data->pagevec; + pages = data->pages.pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_list_add_request(req, &data->pages); + nfs_list_add_request(req, &hdr->pages); *pages++ = req->wb_page; } - req = nfs_list_entry(data->pages.next); - nfs_read_rpcsetup(req, data, desc->pg_count, 0); - list_add(&data->list, res); - desc->pg_rpc_callops = &nfs_read_full_ops; -out: - return ret; + nfs_read_rpcsetup(data, desc->pg_count, 0); + list_add(&data->list, &hdr->rpc_list); + desc->pg_rpc_callops = &nfs_read_common_ops; + return 0; } -int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head) +int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_pagein_multi(desc, head); - return nfs_pagein_one(desc, head); + return nfs_pagein_multi(desc, hdr); + return nfs_pagein_one(desc, hdr); } static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - LIST_HEAD(head); + struct nfs_read_header *rhdr; + struct nfs_pgio_header *hdr; int ret; - ret = nfs_generic_pagein(desc, &head); + rhdr = nfs_readhdr_alloc(); + if (!rhdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; + } + hdr = &rhdr->header; + nfs_pgheader_init(desc, hdr, nfs_readhdr_free); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pagein(desc, hdr); if (ret == 0) - 
ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops); + ret = nfs_do_multiple_reads(&hdr->rpc_list, + desc->pg_rpc_callops); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); return ret; } @@ -389,20 +442,21 @@ static const struct nfs_pageio_ops nfs_pageio_read_ops = { */ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) { + struct inode *inode = data->header->inode; int status; dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, task->tk_status); - status = NFS_PROTO(data->inode)->read_done(task, data); + status = NFS_PROTO(inode)->read_done(task, data); if (status != 0) return status; - nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, data->res.count); + nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count); if (task->tk_status == -ESTALE) { - set_bit(NFS_INO_STALE, &NFS_I(data->inode)->flags); - nfs_mark_for_revalidate(data->inode); + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_mark_for_revalidate(inode); } return 0; } @@ -412,15 +466,13 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; - if (resp->eof || resp->count == argp->count) - return; - /* This is a short read! */ - nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); + nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ - if (resp->count == 0) + if (resp->count == 0) { + nfs_set_pgio_error(data->header, -EIO, argp->offset); return; - + } /* Yes, so retry the read at the end of the data */ data->mds_offset += resp->count; argp->offset += resp->count; @@ -429,114 +481,46 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data rpc_restart_call_prepare(task); } -/* - * Handle a read reply that fills part of a page. 
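
/*
 * Sketch of the short-read policy in nfs_readpage_retry() above: a
 * reply carrying fewer bytes than requested without EOF restarts the
 * read at the end of the data already received, while zero progress is
 * reported as an error. Modeled here as a plain POSIX pread() loop.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static ssize_t read_full(int fd, char *buf, size_t count, off_t offset)
{
	size_t done = 0;

	while (done < count) {
		ssize_t n = pread(fd, buf + done, count - done,
				  offset + (off_t)done);
		if (n < 0) {
			if (errno == EINTR)
				continue;	/* transient, retry as-is */
			return -1;		/* hard error */
		}
		if (n == 0)
			break;			/* EOF: stop retrying */
		done += (size_t)n;		/* restart past data we got */
	}
	return (ssize_t)done;
}

int main(void)
{
	char buf[4096];
	int fd = open("/dev/zero", O_RDONLY);

	if (fd < 0)
		return 1;
	printf("read %zd bytes\n", read_full(fd, buf, sizeof(buf), 0));
	close(fd);
	return 0;
}
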
- */ -static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) +static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - + struct nfs_pgio_header *hdr = data->header; + + /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ if (nfs_readpage_result(task, data) != 0) return; if (task->tk_status < 0) - return; - - nfs_readpage_truncate_uninitialised_page(data); - nfs_readpage_retry(task, data); + nfs_set_pgio_error(hdr, task->tk_status, data->args.offset); + else if (data->res.eof) { + loff_t bound; + + bound = data->args.offset + data->res.count; + spin_lock(&hdr->lock); + if (bound < hdr->io_start + hdr->good_bytes) { + set_bit(NFS_IOHDR_EOF, &hdr->flags); + clear_bit(NFS_IOHDR_ERROR, &hdr->flags); + hdr->good_bytes = bound - hdr->io_start; + } + spin_unlock(&hdr->lock); + } else if (data->res.count != data->args.count) + nfs_readpage_retry(task, data); } -static void nfs_readpage_release_partial(void *calldata) +static void nfs_readpage_release_common(void *calldata) { - struct nfs_read_data *data = calldata; - struct nfs_page *req = data->req; - struct page *page = req->wb_page; - int status = data->task.tk_status; - - if (status < 0) - set_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags); - - if (atomic_dec_and_test(&req->wb_complete)) { - if (!test_bit(PG_PARTIAL_READ_FAILED, &req->wb_flags)) - SetPageUptodate(page); - nfs_readpage_release(req); - } nfs_readdata_release(calldata); } void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - NFS_PROTO(data->inode)->read_rpc_prepare(task, data); -} - -static const struct rpc_call_ops nfs_read_partial_ops = { - .rpc_call_prepare = nfs_read_prepare, - .rpc_call_done = nfs_readpage_result_partial, - .rpc_release = nfs_readpage_release_partial, -}; - -static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) -{ - unsigned int count = data->res.count; - unsigned int base = data->args.pgbase; - struct page **pages; - - if (data->res.eof) - count = data->args.count; - if (unlikely(count == 0)) - return; - pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; - base &= ~PAGE_CACHE_MASK; - count += base; - for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) - SetPageUptodate(*pages); - if (count == 0) - return; - /* Was this a short read? */ - if (data->res.eof || data->res.count == data->args.count) - SetPageUptodate(*pages); -} - -/* - * This is the callback from RPC telling us whether a reply was - * received or some error occurred (timeout or socket shutdown). - */ -static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) -{ - struct nfs_read_data *data = calldata; - - if (nfs_readpage_result(task, data) != 0) - return; - if (task->tk_status < 0) - return; - /* - * Note: nfs_readpage_retry may change the values of - * data->args. In the multi-page case, we therefore need - * to ensure that we call nfs_readpage_set_pages_uptodate() - * first. 
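/*
 * Editor's sketch (not part of the patch): in
 * nfs_readpage_result_common(), an EOF reported by any sub-read
 * shrinks the header's good_bytes under hdr->lock, so out-of-order
 * completions can only move the valid-data boundary downward.
 * Illustrative userspace version, all names invented:
 */
#include <pthread.h>

struct pgio_hdr {
	pthread_mutex_t lock;
	long long io_start;	/* offset of the first byte of the whole I/O */
	long long good_bytes;	/* contiguous valid bytes from io_start */
	int eof;
};

static void note_eof(struct pgio_hdr *h, long long req_offset, long long count)
{
	long long bound = req_offset + count;	/* end of data the server had */

	pthread_mutex_lock(&h->lock);
	if (bound < h->io_start + h->good_bytes) {
		h->eof = 1;
		h->good_bytes = bound - h->io_start;
	}
	pthread_mutex_unlock(&h->lock);
}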
- */ - nfs_readpage_truncate_uninitialised_page(data); - nfs_readpage_set_pages_uptodate(data); - nfs_readpage_retry(task, data); -} - -static void nfs_readpage_release_full(void *calldata) -{ - struct nfs_read_data *data = calldata; - - while (!list_empty(&data->pages)) { - struct nfs_page *req = nfs_list_entry(data->pages.next); - - nfs_list_remove_request(req); - nfs_readpage_release(req); - } - nfs_readdata_release(calldata); + NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); } -static const struct rpc_call_ops nfs_read_full_ops = { +static const struct rpc_call_ops nfs_read_common_ops = { .rpc_call_prepare = nfs_read_prepare, - .rpc_call_done = nfs_readpage_result_full, - .rpc_release = nfs_readpage_release_full, + .rpc_call_done = nfs_readpage_result_common, + .rpc_release = nfs_readpage_release_common, }; /* @@ -668,11 +652,12 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&pgio, inode); + nfs_pageio_init_read(&pgio, inode, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); nfs_pageio_complete(&pgio); + NFS_I(inode)->read_io += pgio.pg_bytes_written; npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: @@ -684,7 +669,7 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_read_data), + sizeof(struct nfs_read_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4ac7fca..ff656c0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -66,6 +66,7 @@ #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS +#define NFS_TEXT_DATA 1 #ifdef CONFIG_NFS_V3 #define NFS_DEFAULT_VERSION 3 @@ -277,12 +278,22 @@ static match_table_t nfs_vers_tokens = { { Opt_vers_err, NULL } }; +struct nfs_mount_info { + void (*fill_super)(struct super_block *, struct nfs_mount_info *); + int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); + struct nfs_parsed_mount_data *parsed; + struct nfs_clone_mount *cloned; + struct nfs_fh *mntfh; +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct dentry *); static int nfs_show_devname(struct seq_file *, struct dentry *); static int nfs_show_path(struct seq_file *, struct dentry *); static int nfs_show_stats(struct seq_file *, struct dentry *); +static struct dentry *nfs_fs_mount_common(struct file_system_type *, + struct nfs_server *, int, const char *, struct nfs_mount_info *); static struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, @@ -323,12 +334,11 @@ static const struct super_operations nfs_sops = { }; #ifdef CONFIG_NFS_V4 -static int nfs4_validate_text_mount_data(void *options, +static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *); +static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name); static struct dentry *nfs4_try_mount(int flags, const char *dev_name, - struct nfs_parsed_mount_data *data); -static struct dentry *nfs4_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); + struct nfs_mount_info *mount_info); static struct 
dentry *nfs4_remote_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data); static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, @@ -342,7 +352,7 @@ static void nfs4_kill_super(struct super_block *sb); static struct file_system_type nfs4_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .mount = nfs4_mount, + .mount = nfs_fs_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -786,8 +796,8 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server) static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) { - if (nfss->nfs_client && nfss->nfs_client->impl_id) { - struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id; + if (nfss->nfs_client && nfss->nfs_client->cl_implid) { + struct nfs41_impl_id *impl_id = nfss->nfs_client->cl_implid; seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s'," "date='%llu,%u'", impl_id->name, impl_id->domain, @@ -938,7 +948,7 @@ static void nfs_umount_begin(struct super_block *sb) rpc_killall_tasks(rpc); } -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) { struct nfs_parsed_mount_data *data; @@ -953,8 +963,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_UNIX; data->auth_flavor_len = 1; - data->version = version; data->minorversion = 0; + data->need_mount = true; data->net = current->nsproxy->net_ns; security_init_mnt_opts(&data->lsm_opts); } @@ -1674,8 +1684,8 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. */ -static int nfs_try_mount(struct nfs_parsed_mount_data *args, - struct nfs_fh *root_fh) +static int nfs_request_mount(struct nfs_parsed_mount_data *args, + struct nfs_fh *root_fh) { rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); @@ -1738,6 +1748,26 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return nfs_walk_authlist(args, &request); } +static struct dentry *nfs_try_mount(int flags, const char *dev_name, + struct nfs_mount_info *mount_info) +{ + int status; + struct nfs_server *server; + + if (mount_info->parsed->need_mount) { + status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); + if (status) + return ERR_PTR(status); + } + + /* Get a volume representation */ + server = nfs_create_server(mount_info->parsed, mount_info->mntfh); + if (IS_ERR(server)) + return ERR_CAST(server); + + return nfs_fs_mount_common(&nfs_fs_type, server, flags, dev_name, mount_info); +} + /* * Split "dev_name" into "hostname:export_path". 
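/*
 * Editor's sketch (not part of the patch): the new nfs_try_mount()
 * separates the optional MOUNT-protocol round trip (skipped when
 * need_mount was cleared for binary mount data) from server creation
 * and the shared superblock path.  Minimal stand-alone rendering, all
 * helpers invented:
 */
#include <stdbool.h>

struct parsed_opts { bool need_mount; };

static int request_mount_path(struct parsed_opts *p) { (void)p; return 0; }
static int create_server(struct parsed_opts *p)      { (void)p; return 0; }
static int mount_common(struct parsed_opts *p)       { (void)p; return 0; }

static int try_mount(struct parsed_opts *p)
{
	int err;

	if (p->need_mount) {		/* text options: ask mountd for the root fh */
		err = request_mount_path(p);
		if (err)
			return err;
	}
	err = create_server(p);		/* get a volume representation */
	if (err)
		return err;
	return mount_common(p);		/* common sget()-style path */
}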
* @@ -1826,10 +1856,10 @@ out_path: * + breaking back: trying proto=udp after proto=tcp, v2 after v3, * mountproto=tcp after mountproto=udp, and so on */ -static int nfs_validate_mount_data(void *options, - struct nfs_parsed_mount_data *args, - struct nfs_fh *mntfh, - const char *dev_name) +static int nfs23_validate_mount_data(void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) { struct nfs_mount_data *data = (struct nfs_mount_data *)options; struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; @@ -1883,6 +1913,7 @@ static int nfs_validate_mount_data(void *options, args->acregmax = data->acregmax; args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; + args->need_mount = false; memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); @@ -1934,43 +1965,8 @@ static int nfs_validate_mount_data(void *options, } break; - default: { - int status; - - if (nfs_parse_mount_options((char *)options, args) == 0) - return -EINVAL; - - if (!nfs_verify_server_address(sap)) - goto out_no_address; - - if (args->version == 4) -#ifdef CONFIG_NFS_V4 - return nfs4_validate_text_mount_data(options, - args, dev_name); -#else - goto out_v4_not_compiled; -#endif - - nfs_set_port(sap, &args->nfs_server.port, 0); - - nfs_set_mount_transport_protocol(args); - - status = nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - PAGE_SIZE, - &args->nfs_server.export_path, - NFS_MAXPATHLEN); - if (!status) - status = nfs_try_mount(args, mntfh); - - kfree(args->nfs_server.export_path); - args->nfs_server.export_path = NULL; - - if (status) - return status; - - break; - } + default: + return NFS_TEXT_DATA; } #ifndef CONFIG_NFS_V3 @@ -1999,12 +1995,6 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ -#ifndef CONFIG_NFS_V4 -out_v4_not_compiled: - dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); - return -EPROTONOSUPPORT; -#endif /* !CONFIG_NFS_V4 */ - out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; @@ -2018,6 +2008,82 @@ out_invalid_fh: return -EINVAL; } +#ifdef CONFIG_NFS_V4 +static int nfs_validate_mount_data(struct file_system_type *fs_type, + void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + if (fs_type == &nfs_fs_type) + return nfs23_validate_mount_data(options, args, mntfh, dev_name); + return nfs4_validate_mount_data(options, args, dev_name); +} +#else +static int nfs_validate_mount_data(struct file_system_type *fs_type, + void *options, + struct nfs_parsed_mount_data *args, + struct nfs_fh *mntfh, + const char *dev_name) +{ + return nfs23_validate_mount_data(options, args, mntfh, dev_name); +} +#endif + +static int nfs_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + int port = 0; + int max_namelen = PAGE_SIZE; + int max_pathlen = NFS_MAXPATHLEN; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + if (nfs_parse_mount_options((char *)options, args) == 0) + return -EINVAL; + + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + if (args->version == 4) { +#ifdef CONFIG_NFS_V4 + port = NFS_PORT; + max_namelen = NFS4_MAXNAMLEN; + max_pathlen = NFS4_MAXPATHLEN; + nfs_validate_transport_protocol(args); + nfs4_validate_mount_flags(args); +#else + goto out_v4_not_compiled; +#endif /* CONFIG_NFS_V4 */ + } else + nfs_set_mount_transport_protocol(args); + + nfs_set_port(sap, 
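/*
 * Editor's sketch (not part of the patch): NFS_TEXT_DATA is a positive
 * sentinel, distinct from every -errno, that the binary validators now
 * return so the caller can fall back to one shared text parser.  The
 * dispatch shape, with invented names:
 */
#define TEXT_DATA 1	/* "options were not binary, re-parse as text" */

static int validate_binary(const void *opts)
{
	(void)opts;		/* a version switch on the binary blob goes here */
	return TEXT_DATA;
}

static int validate_text(const void *opts)
{
	(void)opts;
	return 0;
}

static int validate_mount_data(const void *opts)
{
	int err = validate_binary(opts);

	if (err == TEXT_DATA)	/* not binary: run the text parser instead */
		err = validate_text(opts);
	return err;		/* 0 or a negative errno */
}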
&args->nfs_server.port, port); + + if (args->auth_flavor_len > 1) + goto out_bad_auth; + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + max_namelen, + &args->nfs_server.export_path, + max_pathlen); + +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + +out_no_address: + dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); + return -EINVAL; + +out_bad_auth: + dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n"); + return -EINVAL; +} + static int nfs_compare_remount_data(struct nfs_server *nfss, struct nfs_parsed_mount_data *data) @@ -2129,8 +2195,9 @@ static inline void nfs_initialise_sb(struct super_block *sb) * Finish setting up an NFS2/3 superblock */ static void nfs_fill_super(struct super_block *sb, - struct nfs_parsed_mount_data *data) + struct nfs_mount_info *mount_info) { + struct nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); sb->s_blocksize_bits = 0; @@ -2154,8 +2221,9 @@ static void nfs_fill_super(struct super_block *sb, * Finish setting up a cloned NFS2/3 superblock */ static void nfs_clone_super(struct super_block *sb, - const struct super_block *old_sb) + struct nfs_mount_info *mount_info) { + const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); sb->s_blocksize_bits = old_sb->s_blocksize_bits; @@ -2278,52 +2346,70 @@ static int nfs_compare_super(struct super_block *sb, void *data) return nfs_compare_mount_options(sb, server, mntflags); } +#ifdef CONFIG_NFS_FSCACHE +static void nfs_get_cache_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *parsed, + struct nfs_clone_mount *cloned) +{ + char *uniq = NULL; + int ulen = 0; + + if (parsed && parsed->fscache_uniq) { + uniq = parsed->fscache_uniq; + ulen = strlen(parsed->fscache_uniq); + } else if (cloned) { + struct nfs_server *mnt_s = NFS_SB(cloned->sb); + if (mnt_s->fscache_key) { + uniq = mnt_s->fscache_key->key.uniquifier; + ulen = mnt_s->fscache_key->key.uniq_len; + }; + } + + nfs_fscache_get_super_cookie(sb, uniq, ulen); +} +#else +static void nfs_get_cache_cookie(struct super_block *sb, + struct nfs_parsed_mount_data *parsed, + struct nfs_clone_mount *cloned) +{ +} +#endif + static int nfs_bdi_register(struct nfs_server *server) { return bdi_register_dev(&server->backing_dev_info, server->s_dev); } -static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) +static int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) +{ + return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); +} + +static int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, + struct nfs_mount_info *mount_info) +{ + /* clone any lsm security options from the parent to the new sb */ + security_sb_clone_mnt_opts(mount_info->cloned->sb, s); + if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) + return -ESTALE; + return 0; +} + +static struct dentry *nfs_fs_mount_common(struct file_system_type *fs_type, + struct nfs_server *server, + int flags, const char *dev_name, + struct nfs_mount_info *mount_info) { - struct nfs_server *server = NULL; struct super_block *s; - struct nfs_parsed_mount_data *data; - struct nfs_fh *mntfh; struct dentry *mntroot = ERR_PTR(-ENOMEM); int (*compare_super)(struct super_block *, void *) = 
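/*
 * Editor's sketch (not part of the patch): nfs_get_cache_cookie() is
 * compiled for real only under CONFIG_NFS_FSCACHE and as an empty stub
 * otherwise, which keeps nfs_fs_mount_common() free of #ifdef clutter.
 * The shape of that idiom, with an invented config symbol:
 */
struct super;	/* opaque here */

#ifdef CONFIG_EXAMPLE_CACHE
static void get_cache_cookie(struct super *sb, const char *uniq, int ulen)
{
	/* ... derive the key and attach the cache cookie ... */
	(void)sb; (void)uniq; (void)ulen;
}
#else
static void get_cache_cookie(struct super *sb, const char *uniq, int ulen)
{
	/* feature compiled out: deliberately a no-op */
	(void)sb; (void)uniq; (void)ulen;
}
#endif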
nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, + .server = server, }; int error; - data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); - mntfh = nfs_alloc_fhandle(); - if (data == NULL || mntfh == NULL) - goto out; - - /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); - if (error < 0) { - mntroot = ERR_PTR(error); - goto out; - } - -#ifdef CONFIG_NFS_V4 - if (data->version == 4) { - mntroot = nfs4_try_mount(flags, dev_name, data); - goto out; - } -#endif /* CONFIG_NFS_V4 */ - - /* Get a volume representation */ - server = nfs_create_server(data, mntfh); - if (IS_ERR(server)) { - mntroot = ERR_CAST(server); - goto out; - } - sb_mntdata.server = server; - if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; @@ -2351,23 +2437,21 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, data); - nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL); + mount_info->fill_super(s, mount_info); + nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); } - mntroot = nfs_get_root(s, mntfh, dev_name); + mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); if (IS_ERR(mntroot)) goto error_splat_super; - error = security_sb_set_mnt_opts(s, &data->lsm_opts); + error = mount_info->set_security(s, mntroot, mount_info); if (error) goto error_splat_root; s->s_flags |= MS_ACTIVE; out: - nfs_free_parsed_mount_data(data); - nfs_free_fhandle(mntfh); return mntroot; out_err_nosb: @@ -2385,6 +2469,43 @@ error_splat_bdi: goto out; } +static struct dentry *nfs_fs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) +{ + struct nfs_mount_info mount_info = { + .fill_super = nfs_fill_super, + .set_security = nfs_set_sb_security, + }; + struct dentry *mntroot = ERR_PTR(-ENOMEM); + int error; + + mount_info.parsed = nfs_alloc_parsed_mount_data(); + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.parsed == NULL || mount_info.mntfh == NULL) + goto out; + + /* Validate the mount data */ + error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name); + if (error == NFS_TEXT_DATA) + error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name); + if (error < 0) { + mntroot = ERR_PTR(error); + goto out; + } + +#ifdef CONFIG_NFS_V4 + if (mount_info.parsed->version == 4) + mntroot = nfs4_try_mount(flags, dev_name, &mount_info); + else +#endif /* CONFIG_NFS_V4 */ + mntroot = nfs_try_mount(flags, dev_name, &mount_info); + +out: + nfs_free_parsed_mount_data(mount_info.parsed); + nfs_free_fhandle(mount_info.mntfh); + return mntroot; +} + /* * Ensure that we unregister the bdi before kill_anon_super * releases the device name @@ -2409,93 +2530,51 @@ static void nfs_kill_super(struct super_block *s) } /* - * Clone an NFS2/3 server record on xdev traversal (FSID-change) + * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) */ static struct dentry * -nfs_xdev_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) +nfs_xdev_mount_common(struct file_system_type *fs_type, int flags, + const char *dev_name, struct nfs_mount_info *mount_info) { - struct nfs_clone_mount *data = raw_data; - struct super_block *s; + struct nfs_clone_mount *data = mount_info->cloned; struct nfs_server *server; - struct dentry *mntroot; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct 
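/*
 * Editor's sketch (not part of the patch): sget() either reuses a
 * superblock that the compare callback matches or creates a fresh one;
 * passing a NULL comparator (as nfs_fs_mount_common() does for
 * NFS_MOUNT_UNSHARED) disables sharing outright.  A toy find-or-create
 * with the same contract:
 */
#include <stdlib.h>

struct obj { int key; struct obj *next; };

static struct obj *pool;

static struct obj *get_or_create(int (*cmp)(struct obj *, int), int key)
{
	struct obj *o;

	if (cmp)			/* NULL cmp == never share */
		for (o = pool; o; o = o->next)
			if (cmp(o, key))
				return o;	/* reuse the existing instance */

	o = calloc(1, sizeof(*o));
	if (o) {
		o->key = key;
		o->next = pool;
		pool = o;
	}
	return o;
}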
nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, - }; + struct dentry *mntroot = ERR_PTR(-ENOMEM); int error; - dprintk("--> nfs_xdev_mount()\n"); + dprintk("--> nfs_xdev_mount_common()\n"); + + mount_info->mntfh = data->fh; /* create a new volume representation */ server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); if (IS_ERR(server)) { error = PTR_ERR(server); - goto out_err_noserver; - } - sb_mntdata.server = server; - - if (server->flags & NFS_MOUNT_UNSHARED) - compare_super = NULL; - - /* -o noac implies -o sync */ - if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_nosb; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } else { - error = nfs_bdi_register(server); - if (error) - goto error_splat_bdi; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs_clone_super(s, data->sb); - nfs_fscache_get_super_cookie(s, NULL, data); - } - - mntroot = nfs_get_root(s, data->fh, dev_name); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { - dput(mntroot); - error = -ESTALE; - goto error_splat_super; + goto out_err; } - s->s_flags |= MS_ACTIVE; - - /* clone any lsm security options from the parent to the new sb */ - security_sb_clone_mnt_opts(data->sb, s); - - dprintk("<-- nfs_xdev_mount() = 0\n"); + mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info); + dprintk("<-- nfs_xdev_mount_common() = 0\n"); +out: return mntroot; -out_err_nosb: - nfs_free_server(server); -out_err_noserver: - dprintk("<-- nfs_xdev_mount() = %d [error]\n", error); - return ERR_PTR(error); +out_err: + dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); + goto out; +} -error_splat_super: - if (server && !s->s_root) - bdi_unregister(&server->backing_dev_info); -error_splat_bdi: - deactivate_locked_super(s); - dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error); - return ERR_PTR(error); +/* + * Clone an NFS2/3 server record on xdev traversal (FSID-change) + */ +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + struct nfs_mount_info mount_info = { + .fill_super = nfs_clone_super, + .set_security = nfs_clone_sb_security, + .cloned = raw_data, + }; + return nfs_xdev_mount_common(&nfs_fs_type, flags, dev_name, &mount_info); } #ifdef CONFIG_NFS_V4 @@ -2504,8 +2583,9 @@ error_splat_bdi: * Finish setting up a cloned NFS4 superblock */ static void nfs4_clone_super(struct super_block *sb, - const struct super_block *old_sb) + struct nfs_mount_info *mount_info) { + const struct super_block *old_sb = mount_info->cloned->sb; sb->s_blocksize_bits = old_sb->s_blocksize_bits; sb->s_blocksize = old_sb->s_blocksize; sb->s_maxbytes = old_sb->s_maxbytes; @@ -2523,7 +2603,8 @@ static void nfs4_clone_super(struct super_block *sb, /* * Set up an NFS4 superblock */ -static void nfs4_fill_super(struct super_block *sb) +static void nfs4_fill_super(struct super_block *sb, + struct nfs_mount_info *mount_info) { sb->s_time_gran = 1; sb->s_op = &nfs4_sops; @@ -2542,37 +2623,6 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); } -static 
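/*
 * Editor's sketch (not part of the patch): nfs_xdev_mount_common()
 * keeps the kernel's goto-unwind idiom, with one success exit and
 * labelled error exits that trace the return value the way its
 * dprintk() pairs do.  A compact userspace skeleton:
 */
#include <stdio.h>
#include <stdlib.h>

static int do_mount_like(void)
{
	char *a, *b;
	int error = -1;

	printf("--> do_mount_like()\n");
	a = malloc(16);
	if (!a)
		goto out_err;
	b = malloc(16);
	if (!b)
		goto out_free_a;

	printf("<-- do_mount_like() = 0\n");
	free(b);
	free(a);
	return 0;

out_free_a:
	free(a);
out_err:
	printf("<-- do_mount_like() = %d [error]\n", error);
	return error;
}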
int nfs4_validate_text_mount_data(void *options, - struct nfs_parsed_mount_data *args, - const char *dev_name) -{ - struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - - nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); - - nfs_validate_transport_protocol(args); - - nfs4_validate_mount_flags(args); - - if (args->version != 4) { - dfprintk(MOUNT, - "NFS4: Illegal mount version\n"); - return -EINVAL; - } - - if (args->auth_flavor_len > 1) { - dfprintk(MOUNT, - "NFS4: Too many RPC auth flavours specified\n"); - return -EINVAL; - } - - return nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - NFS4_MAXNAMLEN, - &args->nfs_server.export_path, - NFS4_MAXPATHLEN); -} - /* * Validate NFSv4 mount options */ @@ -2643,13 +2693,7 @@ static int nfs4_validate_mount_data(void *options, break; default: - if (nfs_parse_mount_options((char *)options, args) == 0) - return -EINVAL; - - if (!nfs_verify_server_address(sap)) - return -EINVAL; - - return nfs4_validate_text_mount_data(options, args, dev_name); + return NFS_TEXT_DATA; } return 0; @@ -2673,91 +2717,26 @@ out_no_address: */ static struct dentry * nfs4_remote_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) + const char *dev_name, void *info) { - struct nfs_parsed_mount_data *data = raw_data; - struct super_block *s; + struct nfs_mount_info *mount_info = info; struct nfs_server *server; - struct nfs_fh *mntfh; - struct dentry *mntroot; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, - }; - int error = -ENOMEM; + struct dentry *mntroot = ERR_PTR(-ENOMEM); - mntfh = nfs_alloc_fhandle(); - if (data == NULL || mntfh == NULL) - goto out; + mount_info->fill_super = nfs4_fill_super; + mount_info->set_security = nfs_set_sb_security; /* Get a volume representation */ - server = nfs4_create_server(data, mntfh); + server = nfs4_create_server(mount_info->parsed, mount_info->mntfh); if (IS_ERR(server)) { - error = PTR_ERR(server); + mntroot = ERR_CAST(server); goto out; } - sb_mntdata.server = server; - if (server->flags & NFS4_MOUNT_UNSHARED) - compare_super = NULL; - - /* -o noac implies -o sync */ - if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_free; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } else { - error = nfs_bdi_register(server); - if (error) - goto error_splat_bdi; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs4_fill_super(s); - nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL); - } - - mntroot = nfs4_get_root(s, mntfh, dev_name); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - - error = security_sb_set_mnt_opts(s, &data->lsm_opts); - if (error) - goto error_splat_root; - - s->s_flags |= MS_ACTIVE; - - nfs_free_fhandle(mntfh); - return mntroot; + mntroot = nfs_fs_mount_common(fs_type, server, flags, dev_name, mount_info); out: - nfs_free_fhandle(mntfh); - return ERR_PTR(error); - -out_free: - nfs_free_server(server); - goto out; - -error_splat_root: - dput(mntroot); -error_splat_super: - if (server && !s->s_root) - bdi_unregister(&server->backing_dev_info); -error_splat_bdi: - deactivate_locked_super(s); - goto out; + return mntroot; } static struct 
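/*
 * Editor's sketch (not part of the patch): the rewritten
 * nfs4_remote_mount() returns errors as ERR_CAST(server) instead of a
 * separate int, relying on the kernel's ERR_PTR() convention of
 * encoding small negative errnos inside a pointer.  A userspace
 * re-creation of that convention:
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err)     { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;	/* top 4095 addresses */
}

static void *create_server(void)
{
	return ERR_PTR(-12);	/* -ENOMEM, hard-coded to stay self-contained */
}

int main(void)
{
	void *server = create_server();

	if (IS_ERR(server))	/* ERR_CAST() is the same trick across types */
		printf("error %ld\n", PTR_ERR(server));
	return 0;
}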
vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, @@ -2869,17 +2848,18 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, } static struct dentry *nfs4_try_mount(int flags, const char *dev_name, - struct nfs_parsed_mount_data *data) + struct nfs_mount_info *mount_info) { char *export_path; struct vfsmount *root_mnt; struct dentry *res; + struct nfs_parsed_mount_data *data = mount_info->parsed; dfprintk(MOUNT, "--> nfs4_try_mount()\n"); export_path = data->nfs_server.export_path; data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, data->nfs_server.hostname); data->nfs_server.export_path = export_path; @@ -2891,38 +2871,6 @@ static struct dentry *nfs4_try_mount(int flags, const char *dev_name, return res; } -/* - * Get the superblock for an NFS4 mountpoint - */ -static struct dentry *nfs4_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) -{ - struct nfs_parsed_mount_data *data; - int error = -ENOMEM; - struct dentry *res = ERR_PTR(-ENOMEM); - - data = nfs_alloc_parsed_mount_data(4); - if (data == NULL) - goto out; - - /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, data, dev_name); - if (error < 0) { - res = ERR_PTR(error); - goto out; - } - - res = nfs4_try_mount(flags, dev_name, data); - if (IS_ERR(res)) - error = PTR_ERR(res); - -out: - nfs_free_parsed_mount_data(data); - dprintk("<-- nfs4_mount() = %d%s\n", error, - error != 0 ? " [error]" : ""); - return res; -} - static void nfs4_kill_super(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); @@ -2942,181 +2890,43 @@ static struct dentry * nfs4_xdev_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { - struct nfs_clone_mount *data = raw_data; - struct super_block *s; - struct nfs_server *server; - struct dentry *mntroot; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, + struct nfs_mount_info mount_info = { + .fill_super = nfs4_clone_super, + .set_security = nfs_clone_sb_security, + .cloned = raw_data, }; - int error; - - dprintk("--> nfs4_xdev_mount()\n"); - - /* create a new volume representation */ - server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err_noserver; - } - sb_mntdata.server = server; - - if (server->flags & NFS4_MOUNT_UNSHARED) - compare_super = NULL; - - /* -o noac implies -o sync */ - if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_nosb; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } else { - error = nfs_bdi_register(server); - if (error) - goto error_splat_bdi; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs4_clone_super(s, data->sb); - nfs_fscache_get_super_cookie(s, NULL, data); - } - - mntroot = nfs4_get_root(s, data->fh, dev_name); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { - dput(mntroot); - error = -ESTALE; - goto error_splat_super; - } - - 
s->s_flags |= MS_ACTIVE; - - security_sb_clone_mnt_opts(data->sb, s); - - dprintk("<-- nfs4_xdev_mount() = 0\n"); - return mntroot; - -out_err_nosb: - nfs_free_server(server); -out_err_noserver: - dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error); - return ERR_PTR(error); - -error_splat_super: - if (server && !s->s_root) - bdi_unregister(&server->backing_dev_info); -error_splat_bdi: - deactivate_locked_super(s); - dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error); - return ERR_PTR(error); + return nfs_xdev_mount_common(&nfs4_fs_type, flags, dev_name, &mount_info); } static struct dentry * nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { - struct nfs_clone_mount *data = raw_data; - struct super_block *s; - struct nfs_server *server; - struct dentry *mntroot; - struct nfs_fh *mntfh; - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, + struct nfs_mount_info mount_info = { + .fill_super = nfs4_fill_super, + .set_security = nfs_clone_sb_security, + .cloned = raw_data, }; - int error = -ENOMEM; + struct nfs_server *server; + struct dentry *mntroot = ERR_PTR(-ENOMEM); dprintk("--> nfs4_referral_get_sb()\n"); - mntfh = nfs_alloc_fhandle(); - if (mntfh == NULL) - goto out_err_nofh; + mount_info.mntfh = nfs_alloc_fhandle(); + if (mount_info.cloned == NULL || mount_info.mntfh == NULL) + goto out; /* create a new volume representation */ - server = nfs4_create_referral_server(data, mntfh); + server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err_noserver; - } - sb_mntdata.server = server; - - if (server->flags & NFS4_MOUNT_UNSHARED) - compare_super = NULL; - - /* -o noac implies -o sync */ - if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; - - /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_nosb; - } - - if (s->s_fs_info != server) { - nfs_free_server(server); - server = NULL; - } else { - error = nfs_bdi_register(server); - if (error) - goto error_splat_bdi; - } - - if (!s->s_root) { - /* initial superblock/root creation */ - nfs4_fill_super(s); - nfs_fscache_get_super_cookie(s, NULL, data); - } - - mntroot = nfs4_get_root(s, mntfh, dev_name); - if (IS_ERR(mntroot)) { - error = PTR_ERR(mntroot); - goto error_splat_super; - } - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) { - dput(mntroot); - error = -ESTALE; - goto error_splat_super; + mntroot = ERR_CAST(server); + goto out; } - s->s_flags |= MS_ACTIVE; - - security_sb_clone_mnt_opts(data->sb, s); - - nfs_free_fhandle(mntfh); - dprintk("<-- nfs4_referral_get_sb() = 0\n"); + mntroot = nfs_fs_mount_common(&nfs4_fs_type, server, flags, dev_name, &mount_info); +out: + nfs_free_fhandle(mount_info.mntfh); return mntroot; - -out_err_nosb: - nfs_free_server(server); -out_err_noserver: - nfs_free_fhandle(mntfh); -out_err_nofh: - dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); - return ERR_PTR(error); - -error_splat_super: - if (server && !s->s_root) - bdi_unregister(&server->backing_dev_info); -error_splat_bdi: - deactivate_locked_super(s); - nfs_free_fhandle(mntfh); - dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); - return ERR_PTR(error); } /* diff --git a/fs/nfs/write.c 
b/fs/nfs/write.c index c074623..e6fe3d6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -39,20 +39,20 @@ /* * Local function declarations */ -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc, - struct inode *inode, int ioflags); static void nfs_redirty_request(struct nfs_page *req); -static const struct rpc_call_ops nfs_write_partial_ops; -static const struct rpc_call_ops nfs_write_full_ops; +static const struct rpc_call_ops nfs_write_common_ops; static const struct rpc_call_ops nfs_commit_ops; +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; +static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_write_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(void) { - struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); if (p) { memset(p, 0, sizeof(*p)); @@ -62,46 +62,73 @@ struct nfs_write_data *nfs_commitdata_alloc(void) } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); -void nfs_commit_free(struct nfs_write_data *p) +void nfs_commit_free(struct nfs_commit_data *p) { - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) +struct nfs_write_header *nfs_writehdr_alloc(void) { - struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); if (p) { + struct nfs_pgio_header *hdr = &p->header; + memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); - if (!p->pagevec) { - mempool_free(p, nfs_wdata_mempool); - p = NULL; - } - } + INIT_LIST_HEAD(&hdr->pages); + INIT_LIST_HEAD(&hdr->rpc_list); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); } return p; } -void nfs_writedata_free(struct nfs_write_data *p) +static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, + unsigned int pagecount) +{ + struct nfs_write_data *data, *prealloc; + + prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; + if (prealloc->header == NULL) + data = prealloc; + else + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + if (nfs_pgarray_set(&data->pages, pagecount)) { + data->header = hdr; + atomic_inc(&hdr->refcnt); + } else { + if (data != prealloc) + kfree(data); + data = NULL; + } +out: + return data; +} + +void nfs_writehdr_free(struct nfs_pgio_header *hdr) { - if (p && (p->pagevec != &p->page_array[0])) - kfree(p->pagevec); - mempool_free(p, nfs_wdata_mempool); + struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header); + mempool_free(whdr, nfs_wdata_mempool); } void nfs_writedata_release(struct nfs_write_data *wdata) { + struct nfs_pgio_header *hdr = wdata->header; + struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); + put_nfs_open_context(wdata->args.context); - nfs_writedata_free(wdata); + if (wdata->pages.pagevec != wdata->pages.page_array) + kfree(wdata->pages.pagevec); + if (wdata != &write_header->rpc_data) + 
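/*
 * Editor's sketch (not part of the patch): nfs_writedata_alloc() hands
 * out the rpc_data struct embedded in the write header first (its
 * header pointer doubles as an "in use" flag) and only heap-allocates
 * for further sub-requests; release hands the embedded slot back
 * rather than freeing it.  The same shape in plain C:
 */
#include <stdlib.h>

struct header;
struct data { struct header *hdr; };	/* NULL while the embedded slot is free */
struct header { struct data rpc_data; };

static struct data *data_alloc(struct header *hdr)
{
	struct data *d = &hdr->rpc_data;

	if (d->hdr != NULL)		/* embedded slot taken: fall back to heap */
		d = calloc(1, sizeof(*d));
	if (d)
		d->hdr = hdr;
	return d;
}

static void data_free(struct data *d)
{
	struct header *hdr = d->hdr;

	if (d != &hdr->rpc_data)
		free(d);
	else
		d->hdr = NULL;		/* hand the embedded slot back */
}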
kfree(wdata); + else + wdata->header = NULL; + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); } static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) @@ -203,7 +230,6 @@ static int nfs_set_page_writeback(struct page *page) struct inode *inode = page->mapping->host; struct nfs_server *nfss = NFS_SERVER(inode); - page_cache_get(page); if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) { set_bdi_congested(&nfss->backing_dev_info, @@ -219,7 +245,6 @@ static void nfs_end_page_writeback(struct page *page) struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); - page_cache_release(page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } @@ -235,10 +260,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo req = nfs_page_find_request_locked(page); if (req == NULL) break; - if (nfs_lock_request_dontget(req)) + if (nfs_lock_request(req)) break; /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_lock_request_dontget() will always + * then the call to nfs_lock_request() will always * succeed provided that someone hasn't already marked the * request as dirty (in which case we don't care). */ @@ -310,7 +335,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -353,7 +379,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), + &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); @@ -379,7 +406,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); /* Lock the request! */ - nfs_lock_request_dontget(req); + nfs_lock_request(req); spin_lock(&inode->i_lock); if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE)) @@ -421,65 +448,88 @@ nfs_mark_request_dirty(struct nfs_page *req) /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page - * @head: commit list head + * @dst: commit list head + * @cinfo: holds list lock and accounting info * - * This sets the PG_CLEAN bit, updates the inode global count of + * This sets the PG_CLEAN bit, updates the cinfo count of * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must _not_ hold the inode->i_lock, but must be + * The caller must _not_ hold the cinfo->lock, but must be * holding the nfs_page lock. 
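/*
 * Editor's sketch (not part of the patch): the writeback accounting
 * around nfs_set_page_writeback()/nfs_end_page_writeback() flips the
 * bdi congested state with hysteresis, using separate ON and OFF
 * thresholds so the flag does not flap at the boundary.  Thresholds
 * and names here are illustrative:
 */
#include <stdatomic.h>
#include <stdbool.h>

enum { CONGESTION_ON = 64, CONGESTION_OFF = 48 };

static atomic_long writeback;	/* pages currently under writeback */
static atomic_bool congested;

static void page_writeback_start(void)
{
	if (atomic_fetch_add(&writeback, 1) + 1 > CONGESTION_ON)
		atomic_store(&congested, true);
}

static void page_writeback_end(void)
{
	if (atomic_fetch_sub(&writeback, 1) - 1 < CONGESTION_OFF)
		atomic_store(&congested, false);
}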
*/ void -nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head) +nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; - set_bit(PG_CLEAN, &(req)->wb_flags); - spin_lock(&inode->i_lock); - nfs_list_add_request(req, head); - NFS_I(inode)->ncommit++; - spin_unlock(&inode->i_lock); - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + spin_lock(cinfo->lock); + nfs_list_add_request(req, dst); + cinfo->mds->ncommit++; + spin_unlock(cinfo->lock); + if (!cinfo->dreq) { + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + __mark_inode_dirty(req->wb_context->dentry->d_inode, + I_DIRTY_DATASYNC); + } } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); /** * nfs_request_remove_commit_list - Remove request from a commit list * @req: pointer to a nfs_page + * @cinfo: holds list lock and accounting info * - * This clears the PG_CLEAN bit, and updates the inode global count of + * This clears the PG_CLEAN bit, and updates the cinfo's count of * number of outstanding requests requiring a commit * It does not update the MM page stats. * - * The caller _must_ hold the inode->i_lock and the nfs_page lock. + * The caller _must_ hold the cinfo->lock and the nfs_page lock. */ void -nfs_request_remove_commit_list(struct nfs_page *req) +nfs_request_remove_commit_list(struct nfs_page *req, + struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; - if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) return; nfs_list_remove_request(req); - NFS_I(inode)->ncommit--; + cinfo->mds->ncommit--; } EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode) +{ + cinfo->lock = &inode->i_lock; + cinfo->mds = &NFS_I(inode)->commit_info; + cinfo->ds = pnfs_get_ds_info(inode); + cinfo->dreq = NULL; + cinfo->completion_ops = &nfs_commit_completion_ops; +} + +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq) +{ + if (dreq) + nfs_init_cinfo_from_dreq(cinfo, dreq); + else + nfs_init_cinfo_from_inode(cinfo, inode); +} +EXPORT_SYMBOL_GPL(nfs_init_cinfo); /* * Add a request to the inode's commit list. 
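/*
 * Editor's sketch (not part of the patch): struct nfs_commit_info
 * bundles the lock, list head and ncommit counter so helpers stop
 * reaching for inode->i_lock directly, letting direct I/O substitute
 * its own state via nfs_init_cinfo_from_dreq().  A stripped-down
 * userspace analogue:
 */
#include <pthread.h>
#include <stddef.h>

struct commit_stats { long ncommit; };

struct commit_info {
	pthread_mutex_t *lock;
	struct commit_stats *mds;
	void *dreq;		/* non-NULL for direct I/O */
};

static void init_cinfo_from_inode(struct commit_info *c,
				  pthread_mutex_t *ilock,
				  struct commit_stats *stats)
{
	c->lock = ilock;	/* buffered case: the inode's lock and counters */
	c->mds = stats;
	c->dreq = NULL;
}

static void add_commit(struct commit_info *c)
{
	pthread_mutex_lock(c->lock);
	c->mds->ncommit++;	/* the counter now travels with the cinfo */
	pthread_mutex_unlock(c->lock);
}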
*/ -static void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; - - if (pnfs_mark_request_commit(req, lseg)) + if (pnfs_mark_request_commit(req, lseg, cinfo)) return; - nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list); + nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); } static void @@ -494,11 +544,13 @@ nfs_clear_request_commit(struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct inode *inode = req->wb_context->dentry->d_inode; + struct nfs_commit_info cinfo; - if (!pnfs_clear_request_commit(req)) { - spin_lock(&inode->i_lock); - nfs_request_remove_commit_list(req); - spin_unlock(&inode->i_lock); + nfs_init_cinfo_from_inode(&cinfo, inode); + if (!pnfs_clear_request_commit(req, &cinfo)) { + spin_lock(cinfo.lock); + nfs_request_remove_commit_list(req, &cinfo); + spin_unlock(cinfo.lock); } nfs_clear_page_commit(req->wb_page); } @@ -508,28 +560,25 @@ static inline int nfs_write_need_commit(struct nfs_write_data *data) { if (data->verf.committed == NFS_DATA_SYNC) - return data->lseg == NULL; - else - return data->verf.committed != NFS_FILE_SYNC; + return data->header->lseg == NULL; + return data->verf.committed != NFS_FILE_SYNC; } -static inline -int nfs_reschedule_unstable_write(struct nfs_page *req, - struct nfs_write_data *data) +#else +static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, + struct inode *inode) { - if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_mark_request_commit(req, data->lseg); - return 1; - } - if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { - nfs_mark_request_dirty(req); - return 1; - } - return 0; } -#else -static void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) + +void nfs_init_cinfo(struct nfs_commit_info *cinfo, + struct inode *inode, + struct nfs_direct_req *dreq) +{ +} + +void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { } @@ -544,25 +593,57 @@ int nfs_write_need_commit(struct nfs_write_data *data) return 0; } -static inline -int nfs_reschedule_unstable_write(struct nfs_page *req, - struct nfs_write_data *data) +#endif + +static void nfs_write_completion(struct nfs_pgio_header *hdr) { - return 0; + struct nfs_commit_info cinfo; + unsigned long bytes = 0; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) + goto out; + nfs_init_cinfo_from_inode(&cinfo, hdr->inode); + while (!list_empty(&hdr->pages)) { + struct nfs_page *req = nfs_list_entry(hdr->pages.next); + + bytes += req->wb_bytes; + nfs_list_remove_request(req); + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && + (hdr->good_bytes < bytes)) { + nfs_set_pageerror(req->wb_page); + nfs_context_set_write_error(req->wb_context, hdr->error); + goto remove_req; + } + if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { + nfs_mark_request_dirty(req); + goto next; + } + if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + nfs_mark_request_commit(req, hdr->lseg, &cinfo); + goto next; + } +remove_req: + nfs_inode_remove_request(req); +next: + nfs_unlock_request(req); + nfs_end_page_writeback(req->wb_page); + nfs_release_request(req); + } +out: + hdr->release(hdr); } -#endif #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -static int -nfs_need_commit(struct nfs_inode *nfsi) +static unsigned long 
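/*
 * Editor's sketch (not part of the patch): nfs_write_completion()
 * walks the header's request list accumulating byte counts and routes
 * each request down one of four paths, depending on header flags and
 * whether good_bytes still covers it.  The decision table alone:
 */
#include <stdbool.h>
#include <stddef.h>

enum disposition { DISP_ERROR, DISP_REDIRTY, DISP_COMMIT, DISP_REMOVE };

static enum disposition classify(bool hdr_error, size_t good_bytes,
				 bool need_resched, bool need_commit,
				 size_t bytes_so_far)
{
	if (hdr_error && good_bytes < bytes_so_far)
		return DISP_ERROR;	/* beyond the last good byte: fail the page */
	if (need_resched)
		return DISP_REDIRTY;	/* pNFS fallback: rewrite through the MDS */
	if (need_commit)
		return DISP_COMMIT;	/* unstable write: queue for COMMIT */
	return DISP_REMOVE;		/* fully done: drop the request */
}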
+nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { - return nfsi->ncommit > 0; + return cinfo->mds->ncommit; } -/* i_lock held by caller */ -static int -nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max, - spinlock_t *lock) +/* cinfo->lock held by caller */ +int +nfs_scan_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) { struct nfs_page *req, *tmp; int ret = 0; @@ -570,12 +651,13 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max, list_for_each_entry_safe(req, tmp, src, wb_list) { if (!nfs_lock_request(req)) continue; - if (cond_resched_lock(lock)) + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req); + nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); ret++; - if (ret == max) + if ((ret == max) && !cinfo->dreq) break; } return ret; @@ -584,37 +666,38 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max, /* * nfs_scan_commit - Scan an inode for commit requests * @inode: NFS inode to scan - * @dst: destination list + * @dst: mds destination list + * @cinfo: mds and ds lists of reqs ready to commit * * Moves requests from the inode's 'commit' request list. * The requests are *not* checked to ensure that they form a contiguous set. */ -static int -nfs_scan_commit(struct inode *inode, struct list_head *dst) +int +nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo) { - struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; - spin_lock(&inode->i_lock); - if (nfsi->ncommit > 0) { + spin_lock(cinfo->lock); + if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; - ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max, - &inode->i_lock); - ret += pnfs_scan_commit_lists(inode, max - ret, - &inode->i_lock); + ret = nfs_scan_commit_list(&cinfo->mds->list, dst, + cinfo, max); + ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(&inode->i_lock); + spin_unlock(cinfo->lock); return ret; } #else -static inline int nfs_need_commit(struct nfs_inode *nfsi) +static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return 0; } -static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst) +int nfs_scan_commit(struct inode *inode, struct list_head *dst, + struct nfs_commit_info *cinfo) { return 0; } @@ -659,7 +742,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, || end < req->wb_offset) goto out_flushme; - if (nfs_lock_request_dontget(req)) + if (nfs_lock_request(req)) break; /* The request is locked, so wait and then retry */ @@ -729,7 +812,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_unlock_and_release_request(req); return 0; } @@ -766,10 +849,14 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. 
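/*
 * Editor's sketch (not part of the patch): nfs_scan_commit_list()
 * moves up to max entries from the commit list onto the caller's
 * private list, skipping any entry whose trylock fails; the real code
 * additionally drops the spinlock periodically via
 * cond_resched_lock().  A toy single-threaded version:
 */
#include <stdbool.h>

struct creq { bool locked; struct creq *next; };

static int scan_commit_list(struct creq **src, struct creq **dst, int max)
{
	int moved = 0;

	while (*src && moved < max) {
		struct creq *r = *src;

		if (r->locked) {	/* trylock failed: leave it in place */
			src = &r->next;
			continue;
		}
		r->locked = true;
		*src = r->next;		/* unlink from the commit list */
		r->next = *dst;
		*dst = r;		/* hand it to the caller */
		moved++;
	}
	return moved;
}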
*/ -static int nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) { - return PageUptodate(page) && - !(NFS_I(inode)->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA)); + if (nfs_have_delegated_attributes(inode)) + goto out; + if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) + return false; +out: + return PageUptodate(page) != 0; } /* @@ -815,17 +902,6 @@ int nfs_updatepage(struct file *file, struct page *page, return status; } -static void nfs_writepage_release(struct nfs_page *req, - struct nfs_write_data *data) -{ - struct page *page = req->wb_page; - - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) - nfs_inode_remove_request(req); - nfs_unlock_request(req); - nfs_end_page_writeback(page); -} - static int flush_task_priority(int how) { switch (how & (FLUSH_HIGHPRI|FLUSH_LOWPRI)) { @@ -837,18 +913,18 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -int nfs_initiate_write(struct nfs_write_data *data, - struct rpc_clnt *clnt, +int nfs_initiate_write(struct rpc_clnt *clnt, + struct nfs_write_data *data, const struct rpc_call_ops *call_ops, - int how) + int how, int flags) { - struct inode *inode = data->inode; + struct inode *inode = data->header->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = data->cred, + .rpc_cred = data->header->cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, @@ -857,7 +933,7 @@ int nfs_initiate_write(struct nfs_write_data *data, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; int ret = 0; @@ -892,26 +968,21 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, +static void nfs_write_rpcsetup(struct nfs_write_data *data, unsigned int count, unsigned int offset, - int how) + int how, struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct nfs_page *req = data->header->req; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. 
*/ - data->req = req; - data->inode = inode = req->wb_context->dentry->d_inode; - data->cred = req->wb_context->cred; - - data->args.fh = NFS_FH(inode); + data->args.fh = NFS_FH(data->header->inode); data->args.offset = req_offset(req) + offset; /* pnfs_set_layoutcommit needs this */ data->mds_offset = data->args.offset; data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pagevec; + data->args.pages = data->pages.pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; @@ -920,7 +991,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, case 0: break; case FLUSH_COND_STABLE: - if (nfs_need_commit(NFS_I(inode))) + if (nfs_reqs_to_commit(cinfo)) break; default: data->args.stable = NFS_FILE_SYNC; @@ -936,9 +1007,9 @@ static int nfs_do_write(struct nfs_write_data *data, const struct rpc_call_ops *call_ops, int how) { - struct inode *inode = data->args.context->dentry->d_inode; + struct inode *inode = data->header->inode; - return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0); } static int nfs_do_multiple_writes(struct list_head *head, @@ -951,7 +1022,7 @@ static int nfs_do_multiple_writes(struct list_head *head, while (!list_empty(head)) { int ret2; - data = list_entry(head->next, struct nfs_write_data, list); + data = list_first_entry(head, struct nfs_write_data, list); list_del_init(&data->list); ret2 = nfs_do_write(data, call_ops, how); @@ -967,31 +1038,60 @@ static int nfs_do_multiple_writes(struct list_head *head, */ static void nfs_redirty_request(struct nfs_page *req) { - struct page *page = req->wb_page; - nfs_mark_request_dirty(req); nfs_unlock_request(req); - nfs_end_page_writeback(page); + nfs_end_page_writeback(req->wb_page); + nfs_release_request(req); +} + +static void nfs_async_write_error(struct list_head *head) +{ + struct nfs_page *req; + + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } +} + +static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { + .error_cleanup = nfs_async_write_error, + .completion = nfs_write_completion, +}; + +static void nfs_flush_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + set_bit(NFS_IOHDR_REDO, &hdr->flags); + while (!list_empty(&hdr->rpc_list)) { + struct nfs_write_data *data = list_first_entry(&hdr->rpc_list, + struct nfs_write_data, list); + list_del(&data->list); + nfs_writedata_release(data); + } + desc->pg_completion_ops->error_cleanup(&desc->pg_list); } /* * Generate multiple small requests to write out a single * contiguous dirty area on one page. 
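/*
 * Editor's sketch (not part of the patch): the stable-how switch in
 * nfs_write_rpcsetup() keeps a FLUSH_COND_STABLE write UNSTABLE only
 * when a COMMIT is already pending, and otherwise promotes it to
 * FILE_SYNC by deliberate fall-through into the default case.  Flag
 * values here are illustrative:
 */
enum stable_how { UNSTABLE = 0, DATA_SYNC = 1, FILE_SYNC = 2 };
#define FLUSH_STABLE		1
#define FLUSH_COND_STABLE	2

static enum stable_how pick_stability(int ioflags, long reqs_to_commit)
{
	switch (ioflags & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
	case 0:
		return UNSTABLE;
	case FLUSH_COND_STABLE:
		if (reqs_to_commit)
			return UNSTABLE;	/* a commit will cover it anyway */
		/* fall through */
	default:
		return FILE_SYNC;
	}
}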
*/ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { - struct nfs_page *req = nfs_list_entry(desc->pg_list.next); + struct nfs_page *req = hdr->req; struct page *page = req->wb_page; struct nfs_write_data *data; size_t wsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; - int ret = 0; + struct nfs_commit_info cinfo; - nfs_list_remove_request(req); + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) || desc->pg_count > wsize)) desc->pg_ioflags &= ~FLUSH_COND_STABLE; @@ -1001,28 +1101,22 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head do { size_t len = min(nbytes, wsize); - data = nfs_writedata_alloc(1); - if (!data) - goto out_bad; - data->pagevec[0] = page; - nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags); - list_add(&data->list, res); + data = nfs_writedata_alloc(hdr, 1); + if (!data) { + nfs_flush_error(desc, hdr); + return -ENOMEM; + } + data->pages.pagevec[0] = page; + nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo); + list_add(&data->list, &hdr->rpc_list); requests++; nbytes -= len; offset += len; } while (nbytes != 0); - atomic_set(&req->wb_complete, requests); - desc->pg_rpc_callops = &nfs_write_partial_ops; - return ret; - -out_bad: - while (!list_empty(res)) { - data = list_entry(res->next, struct nfs_write_data, list); - list_del(&data->list); - nfs_writedata_release(data); - } - nfs_redirty_request(req); - return -ENOMEM; + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + desc->pg_rpc_callops = &nfs_write_common_ops; + return 0; } /* @@ -1033,62 +1127,71 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. 
*/ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; struct list_head *head = &desc->pg_list; - int ret = 0; + struct nfs_commit_info cinfo; - data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, - desc->pg_count)); + data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base, + desc->pg_count)); if (!data) { - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_redirty_request(req); - } - ret = -ENOMEM; - goto out; + nfs_flush_error(desc, hdr); + return -ENOMEM; } - pages = data->pagevec; + + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); + pages = data->pages.pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_list_add_request(req, &data->pages); + nfs_list_add_request(req, &hdr->pages); *pages++ = req->wb_page; } - req = nfs_list_entry(data->pages.next); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); - list_add(&data->list, res); - desc->pg_rpc_callops = &nfs_write_full_ops; -out: - return ret; + nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + list_add(&data->list, &hdr->rpc_list); + desc->pg_rpc_callops = &nfs_write_common_ops; + return 0; } -int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head) +int nfs_generic_flush(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_flush_multi(desc, head); - return nfs_flush_one(desc, head); + return nfs_flush_multi(desc, hdr); + return nfs_flush_one(desc, hdr); } static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - LIST_HEAD(head); + struct nfs_write_header *whdr; + struct nfs_pgio_header *hdr; int ret; - ret = nfs_generic_flush(desc, &head); + whdr = nfs_writehdr_alloc(); + if (!whdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; + } + hdr = &whdr->header; + nfs_pgheader_init(desc, hdr, nfs_writehdr_free); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_flush(desc, hdr); if (ret == 0) - ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops, - desc->pg_ioflags); + ret = nfs_do_multiple_writes(&hdr->rpc_list, + desc->pg_rpc_callops, + desc->pg_ioflags); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); return ret; } @@ -1098,9 +1201,10 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = { }; void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags) + struct inode *inode, int ioflags, + const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, + nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, NFS_SERVER(inode)->wsize, ioflags); } @@ -1111,80 +1215,27 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags) +void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int 
ioflags, + const struct nfs_pgio_completion_ops *compl_ops) { - if (!pnfs_pageio_init_write(pgio, inode, ioflags)) - nfs_pageio_init_write_mds(pgio, inode, ioflags); + if (!pnfs_pageio_init_write(pgio, inode, ioflags, compl_ops)) + nfs_pageio_init_write_mds(pgio, inode, ioflags, compl_ops); } -/* - * Handle a write reply that flushed part of a page. - */ -static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) +void nfs_write_prepare(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = calldata; - - dprintk("NFS: %5u write(%s/%lld %d@%lld)", - task->tk_pid, - data->req->wb_context->dentry->d_inode->i_sb->s_id, - (long long) - NFS_FILEID(data->req->wb_context->dentry->d_inode), - data->req->wb_bytes, (long long)req_offset(data->req)); - - nfs_writeback_done(task, data); + struct nfs_write_data *data = calldata; + NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); } -static void nfs_writeback_release_partial(void *calldata) +void nfs_commit_prepare(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = calldata; - struct nfs_page *req = data->req; - struct page *page = req->wb_page; - int status = data->task.tk_status; + struct nfs_commit_data *data = calldata; - if (status < 0) { - nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, status); - dprintk(", error = %d\n", status); - goto out; - } - - if (nfs_write_need_commit(data)) { - struct inode *inode = page->mapping->host; - - spin_lock(&inode->i_lock); - if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) { - /* Do nothing we need to resend the writes */ - } else if (!test_and_set_bit(PG_NEED_COMMIT, &req->wb_flags)) { - memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - dprintk(" defer commit\n"); - } else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) { - set_bit(PG_NEED_RESCHED, &req->wb_flags); - clear_bit(PG_NEED_COMMIT, &req->wb_flags); - dprintk(" server reboot detected\n"); - } - spin_unlock(&inode->i_lock); - } else - dprintk(" OK\n"); - -out: - if (atomic_dec_and_test(&req->wb_complete)) - nfs_writepage_release(req, data); - nfs_writedata_release(calldata); + NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -void nfs_write_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - NFS_PROTO(data->inode)->write_rpc_prepare(task, data); -} - -static const struct rpc_call_ops nfs_write_partial_ops = { - .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_writeback_done_partial, - .rpc_release = nfs_writeback_release_partial, -}; - /* * Handle a write reply that flushes a whole page. * @@ -1192,59 +1243,37 @@ static const struct rpc_call_ops nfs_write_partial_ops = { * writebacks since the page->count is kept > 1 for as long * as the page has a write request pending. */ -static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) +static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; nfs_writeback_done(task, data); } -static void nfs_writeback_release_full(void *calldata) +static void nfs_writeback_release_common(void *calldata) { struct nfs_write_data *data = calldata; + struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; + struct nfs_page *req = hdr->req; - /* Update attributes as result of writeback. 
*/ - while (!list_empty(&data->pages)) { - struct nfs_page *req = nfs_list_entry(data->pages.next); - struct page *page = req->wb_page; - - nfs_list_remove_request(req); - - dprintk("NFS: %5u write (%s/%lld %d@%lld)", - data->task.tk_pid, - req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), - req->wb_bytes, - (long long)req_offset(req)); - - if (status < 0) { - nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, status); - dprintk(", error = %d\n", status); - goto remove_request; - } - - if (nfs_write_need_commit(data)) { + if ((status >= 0) && nfs_write_need_commit(data)) { + spin_lock(&hdr->lock); + if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) + ; /* Do nothing */ + else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - nfs_mark_request_commit(req, data->lseg); - dprintk(" marked for commit\n"); - goto next; - } - dprintk(" OK\n"); -remove_request: - nfs_inode_remove_request(req); - next: - nfs_unlock_request(req); - nfs_end_page_writeback(page); + else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) + set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); + spin_unlock(&hdr->lock); } - nfs_writedata_release(calldata); + nfs_writedata_release(data); } -static const struct rpc_call_ops nfs_write_full_ops = { +static const struct rpc_call_ops nfs_write_common_ops = { .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_writeback_done_full, - .rpc_release = nfs_writeback_release_full, + .rpc_call_done = nfs_writeback_done_common, + .rpc_release = nfs_writeback_release_common, }; @@ -1255,6 +1284,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; + struct inode *inode = data->header->inode; int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1267,10 +1297,10 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) * another writer had changed the file, but some applications * depend on tighter cache coherency when writing. */ - status = NFS_PROTO(data->inode)->write_done(task, data); + status = NFS_PROTO(inode)->write_done(task, data); if (status != 0) return; - nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (resp->verf->committed < argp->stable && task->tk_status >= 0) { @@ -1288,46 +1318,47 @@ if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - NFS_SERVER(data->inode)->nfs_client->cl_hostname, + NFS_SERVER(inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } } #endif - /* Is this a short write? */ - if (task->tk_status >= 0 && resp->count < argp->count) { + if (task->tk_status < 0) + nfs_set_pgio_error(data->header, task->tk_status, argp->offset); + else if (resp->count < argp->count) { static unsigned long complain; - nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + /* This is a short write! */ + nfs_inc_stats(inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ - if (resp->count != 0) { - /* Was this an NFSv2 write or an NFSv3 stable write? 
*/ - if (resp->verf->committed != NFS_UNSTABLE) { - /* Resend from where the server left off */ - data->mds_offset += resp->count; - argp->offset += resp->count; - argp->pgbase += resp->count; - argp->count -= resp->count; - } else { - /* Resend as a stable write in order to avoid - * headaches in the case of a server crash. - */ - argp->stable = NFS_FILE_SYNC; + if (resp->count == 0) { + if (time_before(complain, jiffies)) { + printk(KERN_WARNING + "NFS: Server wrote zero bytes, expected %u.\n", + argp->count); + complain = jiffies + 300 * HZ; } - rpc_restart_call_prepare(task); + nfs_set_pgio_error(data->header, -EIO, argp->offset); + task->tk_status = -EIO; return; } - if (time_before(complain, jiffies)) { - printk(KERN_WARNING - "NFS: Server wrote zero bytes, expected %u.\n", - argp->count); - complain = jiffies + 300 * HZ; + /* Was this an NFSv2 write or an NFSv3 stable write? */ + if (resp->verf->committed != NFS_UNSTABLE) { + /* Resend from where the server left off */ + data->mds_offset += resp->count; + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; + } else { + /* Resend as a stable write in order to avoid + * headaches in the case of a server crash. + */ + argp->stable = NFS_FILE_SYNC; } - /* Can't do anything about it except throw an error. */ - task->tk_status = -EIO; + rpc_restart_call_prepare(task); } - return; } @@ -1347,26 +1378,23 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) return (ret < 0) ? ret : 1; } -void nfs_commit_clear_lock(struct nfs_inode *nfsi) +static void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); smp_mb__after_clear_bit(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } -EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); -void nfs_commitdata_release(void *data) +void nfs_commitdata_release(struct nfs_commit_data *data) { - struct nfs_write_data *wdata = data; - - put_nfs_open_context(wdata->args.context); - nfs_commit_free(wdata); + put_nfs_open_context(data->context); + nfs_commit_free(data); } EXPORT_SYMBOL_GPL(nfs_commitdata_release); -int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, +int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, - int how) + int how, int flags) { struct rpc_task *task; int priority = flush_task_priority(how); @@ -1382,7 +1410,7 @@ int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; /* Set up the initial task struct. */ @@ -1403,9 +1431,10 @@ EXPORT_SYMBOL_GPL(nfs_initiate_commit); /* * Set up the argument/result storage required for the RPC call. 
*/ -void nfs_init_commit(struct nfs_write_data *data, - struct list_head *head, - struct pnfs_layout_segment *lseg) +void nfs_init_commit(struct nfs_commit_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct nfs_page *first = nfs_list_entry(head->next); struct inode *inode = first->wb_context->dentry->d_inode; @@ -1419,13 +1448,14 @@ void nfs_init_commit(struct nfs_write_data *data, data->cred = first->wb_context->cred; data->lseg = lseg; /* reference transferred */ data->mds_ops = &nfs_commit_ops; + data->completion_ops = cinfo->completion_ops; + data->dreq = cinfo->dreq; data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(first->wb_context); - data->res.count = 0; + data->context = get_nfs_open_context(first->wb_context); data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); @@ -1433,18 +1463,21 @@ void nfs_init_commit(struct nfs_write_data *data, EXPORT_SYMBOL_GPL(nfs_init_commit); void nfs_retry_commit(struct list_head *page_list, - struct pnfs_layout_segment *lseg) + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo) { struct nfs_page *req; while (!list_empty(page_list)) { req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); - nfs_mark_request_commit(req, lseg); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_unlock_request(req); + nfs_mark_request_commit(req, lseg, cinfo); + if (!cinfo->dreq) { + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + } + nfs_unlock_and_release_request(req); } } EXPORT_SYMBOL_GPL(nfs_retry_commit); @@ -1453,9 +1486,10 @@ EXPORT_SYMBOL_GPL(nfs_retry_commit); * Commit dirty pages */ static int -nfs_commit_list(struct inode *inode, struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how, + struct nfs_commit_info *cinfo) { - struct nfs_write_data *data; + struct nfs_commit_data *data; data = nfs_commitdata_alloc(); @@ -1463,11 +1497,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) goto out_bad; /* Set up the argument struct */ - nfs_init_commit(data, head, NULL); - return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); + nfs_init_commit(data, head, NULL, cinfo); + atomic_inc(&cinfo->mds->rpcs_out); + return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, + how, 0); out_bad: - nfs_retry_commit(head, NULL); - nfs_commit_clear_lock(NFS_I(inode)); + nfs_retry_commit(head, NULL, cinfo); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1476,7 +1512,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) */ static void nfs_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = calldata; + struct nfs_commit_data *data = calldata; dprintk("NFS: %5u nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); @@ -1485,10 +1521,11 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_done(task, data); } -void nfs_commit_release_pages(struct nfs_write_data *data) +static void nfs_commit_release_pages(struct nfs_commit_data *data) { struct nfs_page *req; int status = data->task.tk_status; + struct nfs_commit_info cinfo; 
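/*
 * Editorial aside: the commit path above uses a bare counter to decide
 * when the per-inode commit lock may drop: nfs_commit_list() does
 * atomic_inc(&cinfo->mds->rpcs_out) once per commit RPC it launches,
 * and only the caller that drops the count to zero (below) calls
 * nfs_commit_clear_lock().  A minimal userspace model of that
 * "last one out" pattern, with illustrative names (not kernel API),
 * assuming C11 atomics:
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int rpcs_out;

static void commit_clear_lock(void)
{
	printf("last commit done, lock cleared\n"); /* kernel: wake waiters */
}

static void commit_send(void)
{
	atomic_fetch_add(&rpcs_out, 1); /* like atomic_inc(&cinfo->mds->rpcs_out) */
}

static void commit_release(void)
{
	/* mirrors atomic_dec_and_test(&cinfo.mds->rpcs_out) below */
	if (atomic_fetch_sub(&rpcs_out, 1) == 1)
		commit_clear_lock();
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		commit_send();
	for (int i = 0; i < 3; i++)
		commit_release(); /* only the third call clears the lock */
	return 0;
}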
while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1519,42 +1556,59 @@ void nfs_commit_release_pages(struct nfs_write_data *data) dprintk(" mismatch\n"); nfs_mark_request_dirty(req); next: - nfs_unlock_request(req); + nfs_unlock_and_release_request(req); } + nfs_init_cinfo(&cinfo, data->inode, data->dreq); + if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + nfs_commit_clear_lock(NFS_I(data->inode)); } -EXPORT_SYMBOL_GPL(nfs_commit_release_pages); static void nfs_commit_release(void *calldata) { - struct nfs_write_data *data = calldata; + struct nfs_commit_data *data = calldata; - nfs_commit_release_pages(data); - nfs_commit_clear_lock(NFS_I(data->inode)); + data->completion_ops->completion(data); nfs_commitdata_release(calldata); } static const struct rpc_call_ops nfs_commit_ops = { - .rpc_call_prepare = nfs_write_prepare, + .rpc_call_prepare = nfs_commit_prepare, .rpc_call_done = nfs_commit_done, .rpc_release = nfs_commit_release, }; +static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { + .completion = nfs_commit_release_pages, + .error_cleanup = nfs_commit_clear_lock, +}; + +int nfs_generic_commit_list(struct inode *inode, struct list_head *head, + int how, struct nfs_commit_info *cinfo) +{ + int status; + + status = pnfs_commit_list(inode, head, how, cinfo); + if (status == PNFS_NOT_ATTEMPTED) + status = nfs_commit_list(inode, head, how, cinfo); + return status; +} + int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); + struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; int res; res = nfs_commit_set_lock(NFS_I(inode), may_wait); if (res <= 0) goto out_mark_dirty; - res = nfs_scan_commit(inode, &head); + nfs_init_cinfo_from_inode(&cinfo, inode); + res = nfs_scan_commit(inode, &head, &cinfo); if (res) { int error; - error = pnfs_commit_list(inode, &head, how); - if (error == PNFS_NOT_ATTEMPTED) - error = nfs_commit_list(inode, &head, how); + error = nfs_generic_commit_list(inode, &head, how, &cinfo); if (error < 0) return error; if (!may_wait) @@ -1585,14 +1639,14 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int ret = 0; /* no commits means nothing needs to be done */ - if (!nfsi->ncommit) + if (!nfsi->commit_info.ncommit) return ret; if (wbc->sync_mode == WB_SYNC_NONE) { /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. 
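 * In practice this means a COMMIT is only sent once more than half of
 * the inode's cached pages (npages) are waiting for one; e.g. with
 * npages = 1024, up to 512 commit-pending pages may accumulate before
 * a non-blocking flush triggers a COMMIT.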
*/ - if (nfsi->ncommit <= (nfsi->npages >> 1)) + if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1)) goto out_mark_dirty; /* don't wait for the COMMIT response */ @@ -1665,7 +1719,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) req = nfs_page_find_request(page); if (req == NULL) break; - if (nfs_lock_request_dontget(req)) { + if (nfs_lock_request(req)) { nfs_clear_request_commit(req); nfs_inode_remove_request(req); /* @@ -1673,7 +1727,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) * page as being dirty */ cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_request(req); + nfs_unlock_and_release_request(req); break; } ret = nfs_wait_on_request(req); @@ -1742,7 +1796,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_write_data), + sizeof(struct nfs_write_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) @@ -1753,6 +1807,13 @@ int __init nfs_init_writepagecache(void) if (nfs_wdata_mempool == NULL) return -ENOMEM; + nfs_cdata_cachep = kmem_cache_create("nfs_commit_data", + sizeof(struct nfs_commit_data), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (nfs_cdata_cachep == NULL) + return -ENOMEM; + nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, nfs_wdata_cachep); if (nfs_commit_mempool == NULL) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 79717a4..204438c 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,6 +1,7 @@ /* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched.h> +#include <linux/user_namespace.h> #include "nfsd.h" #include "auth.h" @@ -56,8 +57,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) goto oom; for (i = 0; i < rqgi->ngroups; i++) { - if (!GROUP_AT(rqgi, i)) - GROUP_AT(gi, i) = exp->ex_anon_gid; + if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) + GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid); else GROUP_AT(gi, i) = GROUP_AT(rqgi, i); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8f7b95a..7cc6446 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -734,7 +734,7 @@ void nilfs_evict_inode(struct inode *inode) if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); nilfs_clear_inode(inode); return; } @@ -746,7 +746,7 @@ void nilfs_evict_inode(struct inode *inode) /* TODO: some of the following operations may fail. 
*/ nilfs_truncate_bmap(ii, 0); nilfs_mark_inode_dirty(inode); - end_writeback(inode); + clear_inode(inode); ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); if (!ret) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index fce2bbe..0bb2c20 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -441,7 +441,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child) { unsigned long ino; struct inode *inode; - struct qstr dotdot = {.name = "..", .len = 2}; + struct qstr dotdot = QSTR_INIT("..", 2); struct nilfs_root *root; ino = nilfs_inode_by_name(child->d_inode, &dotdot); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 2eaa666..c6dbd3d 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2258,7 +2258,7 @@ void ntfs_evict_big_inode(struct inode *vi) ntfs_inode *ni = NTFS_I(vi); truncate_inode_pages(&vi->i_data, 0); - end_writeback(vi); + clear_inode(vi); #ifdef NTFS_RW if (NInoDirty(ni)) { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 044e7b5..1bfe880 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2005,7 +2005,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) o2net_listen_sock = sock; INIT_WORK(&o2net_listen_work, o2net_accept_many); - sock->sk->sk_reuse = 1; + sock->sk->sk_reuse = SK_CAN_REUSE; ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (ret < 0) { printk(KERN_ERR "o2net: Error %d while binding socket at " diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 3b5825e..e31d6ae 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -367,7 +367,7 @@ static void dlmfs_evict_inode(struct inode *inode) int status; struct dlmfs_inode_private *ip; - end_writeback(inode); + clear_inode(inode); mlog(0, "inode %lu\n", inode->i_ino); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 17454a9..735514c 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1069,7 +1069,7 @@ static void ocfs2_clear_inode(struct inode *inode) int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); - end_writeback(inode); + clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, inode->i_nlink); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index dbc8422..e6213b3 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -184,7 +184,7 @@ int omfs_sync_inode(struct inode *inode) static void omfs_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); if (inode->i_nlink) return; @@ -316,7 +316,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (override_cred->uid) + kuid_t root_uid = make_kuid(override_cred->user_ns, 0); + if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = @@ -505,15 +506,24 @@ static int chown_common(struct path *path, uid_t user, gid_t group) struct inode *inode = path->dentry->d_inode; int error; struct iattr newattrs; + kuid_t uid; + kgid_t gid; + + uid = make_kuid(current_user_ns(), user); + gid = make_kgid(current_user_ns(), group); newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { + if (!uid_valid(uid)) + return -EINVAL; newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = user; + newattrs.ia_uid = uid; } if (group != (gid_t) -1) { + if (!gid_valid(gid)) + return -EINVAL; newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = group; + newattrs.ia_gid = gid; } 
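/*
 * Editorial aside: make_kuid()/make_kgid() translate a userspace id
 * through the caller's namespace mapping and yield an invalid id when
 * no mapping exists, which is why chown_common() above now returns
 * -EINVAL for unmapped values.  A simplified single-extent model of
 * that lookup, purely illustrative (a real uid_map can hold several
 * "inside outside count" extents):
 */
#include <stdio.h>

struct extent { unsigned first, lower_first, count; };

/* like make_kuid(): ns-local uid -> global kuid, or -1 when unmapped */
static long make_kuid_model(const struct extent *e, unsigned uid)
{
	if (uid >= e->first && uid - e->first < e->count)
		return (long)(uid - e->first) + e->lower_first;
	return -1; /* models an invalid kuid; chown turns this into -EINVAL */
}

int main(void)
{
	struct extent map = { 0, 100000, 65536 }; /* "0 100000 65536" in uid_map */

	printf("%ld\n", make_kuid_model(&map, 1000));  /* 101000 */
	printf("%ld\n", make_kuid_model(&map, 70000)); /* -1: unmapped */
	return 0;
}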
if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= @@ -681,7 +691,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_op = fops_get(inode->i_fop); - error = security_dentry_open(f, cred); + error = security_file_open(f, cred); if (error) goto cleanup_all; diff --git a/fs/proc/array.c b/fs/proc/array.c index f9bd395..dc4c5a7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,6 +81,7 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/user_namespace.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -161,6 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { + struct user_namespace *user_ns = current_user_ns(); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -189,8 +191,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - cred->uid, cred->euid, cred->suid, cred->fsuid, - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kuid_munged(user_ns, cred->uid), + from_kuid_munged(user_ns, cred->euid), + from_kuid_munged(user_ns, cred->suid), + from_kuid_munged(user_ns, cred->fsuid), + from_kgid_munged(user_ns, cred->gid), + from_kgid_munged(user_ns, cred->egid), + from_kgid_munged(user_ns, cred->sgid), + from_kgid_munged(user_ns, cred->fsgid)); task_lock(p); if (p->files) @@ -205,7 +213,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_unlock(p); for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) - seq_printf(m, "%d ", GROUP_AT(group_info, g)); + seq_printf(m, "%d ", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c8b280..d2d3108 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -81,6 +81,7 @@ #include <linux/oom.h> #include <linux/elf.h> #include <linux/pid_namespace.h> +#include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/flex_array.h> @@ -1561,8 +1562,8 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) generic_fillattr(inode, stat); rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, 2)) { @@ -1622,8 +1623,8 @@ int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -1799,10 +1800,15 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) if (task) { files = get_files_struct(task); if (files) { + struct file *file; rcu_read_lock(); - if (fcheck_files(files, fd)) { + file = fcheck_files(files, fd); + if (file) { + unsigned i_mode, f_mode = file->f_mode; + rcu_read_unlock(); put_files_struct(files); + if (task_dumpable(task)) { rcu_read_lock(); cred = __task_cred(task); @@ -1810,10 +1816,17 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + 
inode->i_gid = GLOBAL_ROOT_GID; } - inode->i_mode &= ~(S_ISUID | S_ISGID); + + i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + security_task_to_inode(task, inode); put_task_struct(task); return 1; @@ -1837,8 +1850,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { unsigned fd = *(const unsigned *)ptr; - struct file *file; - struct files_struct *files; struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); @@ -1848,25 +1859,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, goto out; ei = PROC_I(inode); ei->fd = fd; - files = get_files_struct(task); - if (!files) - goto out_iput; - inode->i_mode = S_IFLNK; - - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (!file) - goto out_unlock; - if (file->f_mode & FMODE_READ) - inode->i_mode |= S_IRUSR | S_IXUSR; - if (file->f_mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR | S_IXUSR; - spin_unlock(&files->file_lock); - put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1879,12 +1871,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, out: return error; -out_unlock: - spin_unlock(&files->file_lock); - put_files_struct(files); -out_iput: - iput(inode); - goto out; } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -2060,8 +2046,8 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } security_task_to_inode(task, inode); status = 1; @@ -2177,16 +2163,16 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = ERR_PTR(-EACCES); - if (lock_trace(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) - goto out_unlock; + goto out_put_task; mm = get_task_mm(task); if (!mm) - goto out_unlock; + goto out_put_task; down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); @@ -2198,8 +2184,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, out_no_vma: up_read(&mm->mmap_sem); mmput(mm); -out_unlock: - unlock_trace(task); out_put_task: put_task_struct(task); out: @@ -2233,7 +2217,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; ret = -EACCES; - if (lock_trace(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; ret = 0; @@ -2241,12 +2225,12 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) - goto out_unlock; + goto out_put_task; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out_put_task; filp->f_pos++; default: { @@ -2257,7 +2241,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) mm = get_task_mm(task); if (!mm) - goto out_unlock; + goto out_put_task; down_read(&mm->mmap_sem); nr_files = 0; @@ -2287,7 +2271,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) flex_array_free(fa); 
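/*
 * Editorial aside: tid_fd_revalidate() above now rebuilds the
 * /proc/<pid>/fd link mode purely from the file's f_mode bits instead
 * of poking at the files_struct under files->file_lock.  The
 * derivation is plain bit arithmetic; a self-contained sketch with the
 * FMODE_* and S_I* values copied from the kernel headers:
 */
#include <stdio.h>

#define FMODE_READ  0x1u
#define FMODE_WRITE 0x2u
#define S_IFLNK 0120000u
#define S_IRUSR 0400u
#define S_IWUSR 0200u
#define S_IXUSR 0100u

static unsigned fd_link_mode(unsigned f_mode)
{
	unsigned i_mode = S_IFLNK;

	if (f_mode & FMODE_READ)
		i_mode |= S_IRUSR | S_IXUSR;
	if (f_mode & FMODE_WRITE)
		i_mode |= S_IWUSR | S_IXUSR;
	return i_mode;
}

int main(void)
{
	printf("%o\n", fd_link_mode(FMODE_READ));               /* 120500 */
	printf("%o\n", fd_link_mode(FMODE_READ | FMODE_WRITE)); /* 120700 */
	return 0;
}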
up_read(&mm->mmap_sem); mmput(mm); - goto out_unlock; + goto out_put_task; } for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { @@ -2332,8 +2316,6 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) } } -out_unlock: - unlock_trace(task); out_put_task: put_task_struct(task); out: @@ -2943,6 +2925,74 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) } #endif /* CONFIG_TASK_IO_ACCOUNTING */ +#ifdef CONFIG_USER_NS +static int proc_id_map_open(struct inode *inode, struct file *file, + struct seq_operations *seq_ops) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + struct seq_file *seq; + int ret = -EINVAL; + + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + ret = seq_open(file, seq_ops); + if (ret) + goto err_put_ns; + + seq = file->private_data; + seq->private = ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_id_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + put_user_ns(ns); + return seq_release(inode, file); +} + +static int proc_uid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_uid_seq_operations); +} + +static int proc_gid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_gid_seq_operations); +} + +static const struct file_operations proc_uid_map_operations = { + .open = proc_uid_map_open, + .write = proc_uid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_gid_map_operations = { + .open = proc_gid_map_open, + .write = proc_gid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; +#endif /* CONFIG_USER_NS */ + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3045,6 +3095,10 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), #endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -3400,6 +3454,10 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), #endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 205c922..7ac817b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -33,7 +33,7 @@ static void proc_evict_inode(struct inode *inode) const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); /* Stop tracking associated processes */ put_pid(PROC_I(inode)->pid); @@ -108,8 +108,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) struct super_block *sb = root->d_sb; struct pid_namespace *pid = sb->s_fs_info; - if (pid->pid_gid) - seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) + 
seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); if (pid->hide_pid != 0) seq_printf(seq, ",hidepid=%u", pid->hide_pid); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 21d836f..3476bca 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -371,9 +371,9 @@ void register_sysctl_root(struct ctl_table_root *root) static int test_perm(int mode, int op) { - if (!current_euid()) + if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; - else if (in_egroup_p(0)) + else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; diff --git a/fs/proc/root.c b/fs/proc/root.c index eed44bf..7c30fce 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -67,7 +67,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) case Opt_gid: if (match_int(&args[0], &option)) return 0; - pid->pid_gid = option; + pid->pid_gid = make_kgid(current_user_ns(), option); break; case Opt_hidepid: if (match_int(&args[0], &option)) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8007ae7..23ade26 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -11,3 +11,20 @@ config PSTORE (e.g. ACPI_APEI on X86) which will select this for you. If you don't have a platform persistent store driver, say N. + +config PSTORE_RAM + tristate "Log panic/oops to a RAM buffer" + depends on PSTORE + depends on HAS_IOMEM + depends on HAVE_MEMBLOCK + select REED_SOLOMON + select REED_SOLOMON_ENC8 + select REED_SOLOMON_DEC8 + help + This enables panic and oops messages to be logged to a circular + buffer in RAM where it can be read back at some later point. + + Note that for historical reasons, the module will be named + "ramoops.ko". + + For more information, see Documentation/ramoops.txt. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 760f4bc..278a44e 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -5,3 +5,6 @@ obj-y += pstore.o pstore-objs += inode.o platform.o + +ramoops-objs += ram.o ram_core.o +obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1950788..aeb19e6 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -85,7 +85,7 @@ static void pstore_evict_inode(struct inode *inode) struct pstore_private *p = inode->i_private; unsigned long flags; - end_writeback(inode); + clear_inode(inode); if (p) { spin_lock_irqsave(&allpstore_lock, flags); list_del(&p->list); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c new file mode 100644 index 0000000..9123cce --- /dev/null +++ b/fs/pstore/ram.c @@ -0,0 +1,383 @@ +/* + * RAM Oops/Panic logger + * + * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com> + * Copyright (C) 2011 Kees Cook <keescook@chromium.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/pstore.h> +#include <linux/time.h> +#include <linux/io.h> +#include <linux/ioport.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <linux/pstore_ram.h> + +#define RAMOOPS_KERNMSG_HDR "====" +#define MIN_MEM_SIZE 4096UL + +static ulong record_size = MIN_MEM_SIZE; +module_param(record_size, ulong, 0400); +MODULE_PARM_DESC(record_size, + "size of each dump done on oops/panic"); + +static ulong mem_address; +module_param(mem_address, ulong, 0400); +MODULE_PARM_DESC(mem_address, + "start of reserved RAM used to store oops/panic logs"); + +static ulong mem_size; +module_param(mem_size, ulong, 0400); +MODULE_PARM_DESC(mem_size, + "size of reserved RAM used to store oops/panic logs"); + +static int dump_oops = 1; +module_param(dump_oops, int, 0600); +MODULE_PARM_DESC(dump_oops, + "set to 1 to dump oopses, 0 to only dump panics (default 1)"); + +static int ramoops_ecc; +module_param_named(ecc, ramoops_ecc, int, 0600); +MODULE_PARM_DESC(ramoops_ecc, + "set to 1 to enable ECC support"); + +struct ramoops_context { + struct persistent_ram_zone **przs; + phys_addr_t phys_addr; + unsigned long size; + size_t record_size; + int dump_oops; + bool ecc; + unsigned int count; + unsigned int max_count; + unsigned int read_count; + struct pstore_info pstore; +}; + +static struct platform_device *dummy; +static struct ramoops_platform_data *dummy_data; + +static int ramoops_pstore_open(struct pstore_info *psi) +{ + struct ramoops_context *cxt = psi->data; + + cxt->read_count = 0; + return 0; +} + +static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, + struct timespec *time, + char **buf, + struct pstore_info *psi) +{ + ssize_t size; + struct ramoops_context *cxt = psi->data; + struct persistent_ram_zone *prz; + + if (cxt->read_count >= cxt->max_count) + return -EINVAL; + + *id = cxt->read_count++; + prz = cxt->przs[*id]; + + /* Only supports dmesg output so far. */ + *type = PSTORE_TYPE_DMESG; + /* TODO(kees): Bogus time for the moment. */ + time->tv_sec = 0; + time->tv_nsec = 0; + + size = persistent_ram_old_size(prz); + *buf = kmalloc(size, GFP_KERNEL); + if (*buf == NULL) + return -ENOMEM; + memcpy(*buf, persistent_ram_old(prz), size); + + return size; +} + +static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) +{ + char *hdr; + struct timeval timestamp; + size_t len; + + do_gettimeofday(×tamp); + hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", + (long)timestamp.tv_sec, (long)timestamp.tv_usec); + WARN_ON_ONCE(!hdr); + len = hdr ? strlen(hdr) : 0; + persistent_ram_write(prz, hdr, len); + kfree(hdr); + + return len; +} + +static int ramoops_pstore_write(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, + unsigned int part, + size_t size, struct pstore_info *psi) +{ + struct ramoops_context *cxt = psi->data; + struct persistent_ram_zone *prz = cxt->przs[cxt->count]; + size_t hlen; + + /* Currently ramoops is designed to only store dmesg dumps. */ + if (type != PSTORE_TYPE_DMESG) + return -EINVAL; + + /* Out of the various dmesg dump types, ramoops is currently designed + * to only store crash logs, rather than storing general kernel logs. 
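+ * (Concretely: only KMSG_DUMP_OOPS and KMSG_DUMP_PANIC are accepted
+ * below, and OOPS only when dump_oops is set; any other dump reason
+ * is rejected with -EINVAL.)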
+ */ + if (reason != KMSG_DUMP_OOPS && + reason != KMSG_DUMP_PANIC) + return -EINVAL; + + /* Skip Oopses when configured to do so. */ + if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops) + return -EINVAL; + + /* Explicitly only take the first part of any new crash. + * If our buffer is larger than kmsg_bytes, this can never happen, + * and if our buffer is smaller than kmsg_bytes, we don't want the + * report split across multiple records. + */ + if (part != 1) + return -ENOSPC; + + hlen = ramoops_write_kmsg_hdr(prz); + if (size + hlen > prz->buffer_size) + size = prz->buffer_size - hlen; + persistent_ram_write(prz, cxt->pstore.buf, size); + + cxt->count = (cxt->count + 1) % cxt->max_count; + + return 0; +} + +static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, + struct pstore_info *psi) +{ + struct ramoops_context *cxt = psi->data; + + if (id >= cxt->max_count) + return -EINVAL; + + persistent_ram_free_old(cxt->przs[id]); + + return 0; +} + +static struct ramoops_context oops_cxt = { + .pstore = { + .owner = THIS_MODULE, + .name = "ramoops", + .open = ramoops_pstore_open, + .read = ramoops_pstore_read, + .write = ramoops_pstore_write, + .erase = ramoops_pstore_erase, + }, +}; + +static int __init ramoops_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct ramoops_platform_data *pdata = pdev->dev.platform_data; + struct ramoops_context *cxt = &oops_cxt; + int err = -EINVAL; + int i; + + /* Only a single ramoops area allowed at a time, so fail extra + * probes. + */ + if (cxt->max_count) + goto fail_out; + + if (!pdata->mem_size || !pdata->record_size) { + pr_err("The memory size and the record size must be " + "non-zero\n"); + goto fail_out; + } + + pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); + pdata->record_size = rounddown_pow_of_two(pdata->record_size); + + /* Check for the minimum memory size */ + if (pdata->mem_size < MIN_MEM_SIZE && + pdata->record_size < MIN_MEM_SIZE) { + pr_err("memory size too small, minimum is %lu\n", + MIN_MEM_SIZE); + goto fail_out; + } + + if (pdata->mem_size < pdata->record_size) { + pr_err("The memory size must be larger than the " + "record size\n"); + goto fail_out; + } + + cxt->max_count = pdata->mem_size / pdata->record_size; + cxt->count = 0; + cxt->size = pdata->mem_size; + cxt->phys_addr = pdata->mem_address; + cxt->record_size = pdata->record_size; + cxt->dump_oops = pdata->dump_oops; + cxt->ecc = pdata->ecc; + + cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_count, GFP_KERNEL); + if (!cxt->przs) { + err = -ENOMEM; + dev_err(dev, "failed to initialize a prz array\n"); + goto fail_out; + } + + for (i = 0; i < cxt->max_count; i++) { + size_t sz = cxt->record_size; + phys_addr_t start = cxt->phys_addr + sz * i; + + cxt->przs[i] = persistent_ram_new(start, sz, cxt->ecc); + if (IS_ERR(cxt->przs[i])) { + err = PTR_ERR(cxt->przs[i]); + dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", + sz, (unsigned long long)start, err); + goto fail_przs; + } + } + + cxt->pstore.data = cxt; + cxt->pstore.bufsize = cxt->przs[0]->buffer_size; + cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); + spin_lock_init(&cxt->pstore.buf_lock); + if (!cxt->pstore.buf) { + pr_err("cannot allocate pstore buffer\n"); + goto fail_clear; + } + + err = pstore_register(&cxt->pstore); + if (err) { + pr_err("registering with pstore failed\n"); + goto fail_buf; + } + + /* + * Update the module parameter variables as well so they are visible + * through /sys/module/ramoops/parameters/ + */ + mem_size = 
pdata->mem_size; + mem_address = pdata->mem_address; + record_size = pdata->record_size; + dump_oops = pdata->dump_oops; + + pr_info("attached 0x%lx@0x%llx (%ux0x%zx), ecc: %s\n", + cxt->size, (unsigned long long)cxt->phys_addr, + cxt->max_count, cxt->record_size, + ramoops_ecc ? "on" : "off"); + + return 0; + +fail_buf: + kfree(cxt->pstore.buf); +fail_clear: + cxt->pstore.bufsize = 0; + cxt->max_count = 0; +fail_przs: + for (i = 0; cxt->przs[i]; i++) + persistent_ram_free(cxt->przs[i]); + kfree(cxt->przs); +fail_out: + return err; +} + +static int __exit ramoops_remove(struct platform_device *pdev) +{ +#if 0 + /* TODO(kees): We cannot unload ramoops since pstore doesn't support + * unregistering yet. + */ + struct ramoops_context *cxt = &oops_cxt; + + iounmap(cxt->virt_addr); + release_mem_region(cxt->phys_addr, cxt->size); + cxt->max_count = 0; + + /* TODO(kees): When pstore supports unregistering, call it here. */ + kfree(cxt->pstore.buf); + cxt->pstore.bufsize = 0; + + return 0; +#endif + return -EBUSY; +} + +static struct platform_driver ramoops_driver = { + .remove = __exit_p(ramoops_remove), + .driver = { + .name = "ramoops", + .owner = THIS_MODULE, + }, +}; + +static int __init ramoops_init(void) +{ + int ret; + ret = platform_driver_probe(&ramoops_driver, ramoops_probe); + if (ret == -ENODEV) { + /* + * If we didn't find a platform device, we use module parameters + * building platform data on the fly. + */ + pr_info("platform device not found, using module parameters\n"); + dummy_data = kzalloc(sizeof(struct ramoops_platform_data), + GFP_KERNEL); + if (!dummy_data) + return -ENOMEM; + dummy_data->mem_size = mem_size; + dummy_data->mem_address = mem_address; + dummy_data->record_size = record_size; + dummy_data->dump_oops = dump_oops; + dummy_data->ecc = ramoops_ecc; + dummy = platform_create_bundle(&ramoops_driver, ramoops_probe, + NULL, 0, dummy_data, + sizeof(struct ramoops_platform_data)); + + if (IS_ERR(dummy)) + ret = PTR_ERR(dummy); + else + ret = 0; + } + + return ret; +} + +static void __exit ramoops_exit(void) +{ + platform_driver_unregister(&ramoops_driver); + kfree(dummy_data); +} + +module_init(ramoops_init); +module_exit(ramoops_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Stornelli <marco.stornelli@gmail.com>"); +MODULE_DESCRIPTION("RAM Oops/Panic logger/driver"); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c new file mode 100644 index 0000000..31f8d18 --- /dev/null +++ b/fs/pstore/ram_core.c @@ -0,0 +1,532 @@ +/* + * Copyright (C) 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#include <linux/device.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/list.h> +#include <linux/memblock.h> +#include <linux/rslib.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pstore_ram.h> +#include <asm/page.h> + +struct persistent_ram_buffer { + uint32_t sig; + atomic_t start; + atomic_t size; + uint8_t data[0]; +}; + +#define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */ + +static __initdata LIST_HEAD(persistent_ram_list); + +static inline size_t buffer_size(struct persistent_ram_zone *prz) +{ + return atomic_read(&prz->buffer->size); +} + +static inline size_t buffer_start(struct persistent_ram_zone *prz) +{ + return atomic_read(&prz->buffer->start); +} + +/* increase and wrap the start pointer, returning the old value */ +static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) +{ + int old; + int new; + + do { + old = atomic_read(&prz->buffer->start); + new = old + a; + while (unlikely(new > prz->buffer_size)) + new -= prz->buffer_size; + } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); + + return old; +} + +/* increase the size counter until it hits the max size */ +static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) +{ + size_t old; + size_t new; + + if (atomic_read(&prz->buffer->size) == prz->buffer_size) + return; + + do { + old = atomic_read(&prz->buffer->size); + new = old + a; + if (new > prz->buffer_size) + new = prz->buffer_size; + } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); +} + +static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, + uint8_t *data, size_t len, uint8_t *ecc) +{ + int i; + uint16_t par[prz->ecc_size]; + + /* Initialize the parity buffer */ + memset(par, 0, sizeof(par)); + encode_rs8(prz->rs_decoder, data, len, par, 0); + for (i = 0; i < prz->ecc_size; i++) + ecc[i] = par[i]; +} + +static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, + void *data, size_t len, uint8_t *ecc) +{ + int i; + uint16_t par[prz->ecc_size]; + + for (i = 0; i < prz->ecc_size; i++) + par[i] = ecc[i]; + return decode_rs8(prz->rs_decoder, data, par, len, + NULL, 0, NULL, 0, NULL); +} + +static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, + unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + uint8_t *buffer_end = buffer->data + prz->buffer_size; + uint8_t *block; + uint8_t *par; + int ecc_block_size = prz->ecc_block_size; + int ecc_size = prz->ecc_size; + int size = prz->ecc_block_size; + + if (!prz->ecc) + return; + + block = buffer->data + (start & ~(ecc_block_size - 1)); + par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size; + + do { + if (block + ecc_block_size > buffer_end) + size = buffer_end - block; + persistent_ram_encode_rs8(prz, block, size, par); + block += ecc_block_size; + par += ecc_size; + } while (block < buffer->data + start + count); +} + +static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + + if (!prz->ecc) + return; + + persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), + prz->par_header); +} + +static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + uint8_t *block; + uint8_t *par; + + if (!prz->ecc) + return; + + block = buffer->data; + par = prz->par_buffer; 
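/*
 * Editorial aside: buffer_start_add() and buffer_size_add() above give
 * the zone a lock-free ring buffer: the start pointer wraps modulo
 * buffer_size through a cmpxchg retry loop, while the size saturates
 * at buffer_size.  A userspace sketch of the same arithmetic using C11
 * atomics (buffer size hardwired purely for illustration):
 */
#include <stdatomic.h>
#include <stdio.h>

#define BUF_SIZE 16

static atomic_size_t start, size;

/* like buffer_start_add(): advance and wrap, returning the old start */
static size_t start_add(size_t a)
{
	size_t old = atomic_load(&start), new;

	do {
		new = old + a;
		while (new > BUF_SIZE) /* same ">" wrap as the kernel loop */
			new -= BUF_SIZE;
	} while (!atomic_compare_exchange_weak(&start, &old, new));
	return old;
}

/* like buffer_size_add(): grow until full, then stick at BUF_SIZE */
static void size_add(size_t a)
{
	size_t old = atomic_load(&size), new;

	do {
		if (old == BUF_SIZE)
			return;
		new = old + a;
		if (new > BUF_SIZE)
			new = BUF_SIZE;
	} while (!atomic_compare_exchange_weak(&size, &old, new));
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		size_add(6);
		printf("chunk lands at offset %zu, size now %zu\n",
		       start_add(6), atomic_load(&size));
	}
	return 0;
}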
+ while (block < buffer->data + buffer_size(prz)) { + int numerr; + int size = prz->ecc_block_size; + if (block + size > buffer->data + prz->buffer_size) + size = buffer->data + prz->buffer_size - block; + numerr = persistent_ram_decode_rs8(prz, block, size, par); + if (numerr > 0) { + pr_devel("persistent_ram: error in block %p, %d\n", + block, numerr); + prz->corrected_bytes += numerr; + } else if (numerr < 0) { + pr_devel("persistent_ram: uncorrectable error in block %p\n", + block); + prz->bad_blocks++; + } + block += prz->ecc_block_size; + par += prz->ecc_size; + } +} + +static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, + size_t buffer_size) +{ + int numerr; + struct persistent_ram_buffer *buffer = prz->buffer; + int ecc_blocks; + + if (!prz->ecc) + return 0; + + prz->ecc_block_size = 128; + prz->ecc_size = 16; + prz->ecc_symsize = 8; + prz->ecc_poly = 0x11d; + + ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size); + prz->buffer_size -= (ecc_blocks + 1) * prz->ecc_size; + + if (prz->buffer_size > buffer_size) { + pr_err("persistent_ram: invalid size %zu, non-ecc datasize %zu\n", + buffer_size, prz->buffer_size); + return -EINVAL; + } + + prz->par_buffer = buffer->data + prz->buffer_size; + prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size; + + /* + * first consecutive root is 0 + * primitive element to generate roots = 1 + */ + prz->rs_decoder = init_rs(prz->ecc_symsize, prz->ecc_poly, 0, 1, + prz->ecc_size); + if (prz->rs_decoder == NULL) { + pr_info("persistent_ram: init_rs failed\n"); + return -EINVAL; + } + + prz->corrected_bytes = 0; + prz->bad_blocks = 0; + + numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer), + prz->par_header); + if (numerr > 0) { + pr_info("persistent_ram: error in header, %d\n", numerr); + prz->corrected_bytes += numerr; + } else if (numerr < 0) { + pr_info("persistent_ram: uncorrectable error in header\n"); + prz->bad_blocks++; + } + + return 0; +} + +ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, + char *str, size_t len) +{ + ssize_t ret; + + if (prz->corrected_bytes || prz->bad_blocks) + ret = snprintf(str, len, "" + "\n%d Corrected bytes, %d unrecoverable blocks\n", + prz->corrected_bytes, prz->bad_blocks); + else + ret = snprintf(str, len, "\nNo errors detected\n"); + + return ret; +} + +static void notrace persistent_ram_update(struct persistent_ram_zone *prz, + const void *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + memcpy(buffer->data + start, s, count); + persistent_ram_update_ecc(prz, start, count); +} + +static void __init +persistent_ram_save_old(struct persistent_ram_zone *prz) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + size_t size = buffer_size(prz); + size_t start = buffer_start(prz); + char *dest; + + persistent_ram_ecc_old(prz); + + dest = kmalloc(size, GFP_KERNEL); + if (dest == NULL) { + pr_err("persistent_ram: failed to allocate buffer\n"); + return; + } + + prz->old_log = dest; + prz->old_log_size = size; + memcpy(prz->old_log, &buffer->data[start], size - start); + memcpy(prz->old_log + size - start, &buffer->data[0], start); +} + +int notrace persistent_ram_write(struct persistent_ram_zone *prz, + const void *s, unsigned int count) +{ + int rem; + int c = count; + size_t start; + + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if 
(unlikely(rem < c)) { + persistent_ram_update(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + persistent_ram_update(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return count; +} + +size_t persistent_ram_old_size(struct persistent_ram_zone *prz) +{ + return prz->old_log_size; +} + +void *persistent_ram_old(struct persistent_ram_zone *prz) +{ + return prz->old_log; +} + +void persistent_ram_free_old(struct persistent_ram_zone *prz) +{ + kfree(prz->old_log); + prz->old_log = NULL; + prz->old_log_size = 0; +} + +static void *persistent_ram_vmap(phys_addr_t start, size_t size) +{ + struct page **pages; + phys_addr_t page_start; + unsigned int page_count; + pgprot_t prot; + unsigned int i; + void *vaddr; + + page_start = start - offset_in_page(start); + page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); + + prot = pgprot_noncached(PAGE_KERNEL); + + pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); + if (!pages) { + pr_err("%s: Failed to allocate array for %u pages\n", __func__, + page_count); + return NULL; + } + + for (i = 0; i < page_count; i++) { + phys_addr_t addr = page_start + i * PAGE_SIZE; + pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + } + vaddr = vmap(pages, page_count, VM_MAP, prot); + kfree(pages); + + return vaddr; +} + +static void *persistent_ram_iomap(phys_addr_t start, size_t size) +{ + if (!request_mem_region(start, size, "persistent_ram")) { + pr_err("request mem region (0x%llx@0x%llx) failed\n", + (unsigned long long)size, (unsigned long long)start); + return NULL; + } + + return ioremap(start, size); +} + +static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, + struct persistent_ram_zone *prz) +{ + prz->paddr = start; + prz->size = size; + + if (pfn_valid(start >> PAGE_SHIFT)) + prz->vaddr = persistent_ram_vmap(start, size); + else + prz->vaddr = persistent_ram_iomap(start, size); + + if (!prz->vaddr) { + pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, + (unsigned long long)size, (unsigned long long)start); + return -ENOMEM; + } + + prz->buffer = prz->vaddr + offset_in_page(start); + prz->buffer_size = size - sizeof(struct persistent_ram_buffer); + + return 0; +} + +static int __init persistent_ram_post_init(struct persistent_ram_zone *prz, bool ecc) +{ + int ret; + + prz->ecc = ecc; + + ret = persistent_ram_init_ecc(prz, prz->buffer_size); + if (ret) + return ret; + + if (prz->buffer->sig == PERSISTENT_RAM_SIG) { + if (buffer_size(prz) > prz->buffer_size || + buffer_start(prz) > buffer_size(prz)) + pr_info("persistent_ram: found existing invalid buffer," + " size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); + else { + pr_info("persistent_ram: found existing buffer," + " size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); + persistent_ram_save_old(prz); + } + } else { + pr_info("persistent_ram: no valid data in buffer" + " (sig = 0x%08x)\n", prz->buffer->sig); + } + + prz->buffer->sig = PERSISTENT_RAM_SIG; + atomic_set(&prz->buffer->start, 0); + atomic_set(&prz->buffer->size, 0); + + return 0; +} + +void persistent_ram_free(struct persistent_ram_zone *prz) +{ + if (pfn_valid(prz->paddr >> PAGE_SHIFT)) { + vunmap(prz->vaddr); + } else { + iounmap(prz->vaddr); + release_mem_region(prz->paddr, prz->size); + } + persistent_ram_free_old(prz); + kfree(prz); +} + +struct persistent_ram_zone * __init persistent_ram_new(phys_addr_t start, + size_t size, + bool ecc) +{ + struct persistent_ram_zone *prz; + int ret = -ENOMEM; + + prz = kzalloc(sizeof(struct 
persistent_ram_zone), GFP_KERNEL); + if (!prz) { + pr_err("persistent_ram: failed to allocate persistent ram zone\n"); + goto err; + } + + ret = persistent_ram_buffer_map(start, size, prz); + if (ret) + goto err; + + persistent_ram_post_init(prz, ecc); + persistent_ram_update_header_ecc(prz); + + return prz; +err: + kfree(prz); + return ERR_PTR(ret); +} + +#ifndef MODULE +static int __init persistent_ram_buffer_init(const char *name, + struct persistent_ram_zone *prz) +{ + int i; + struct persistent_ram *ram; + struct persistent_ram_descriptor *desc; + phys_addr_t start; + + list_for_each_entry(ram, &persistent_ram_list, node) { + start = ram->start; + for (i = 0; i < ram->num_descs; i++) { + desc = &ram->descs[i]; + if (!strcmp(desc->name, name)) + return persistent_ram_buffer_map(start, + desc->size, prz); + start += desc->size; + } + } + + return -EINVAL; +} + +static __init +struct persistent_ram_zone *__persistent_ram_init(struct device *dev, bool ecc) +{ + struct persistent_ram_zone *prz; + int ret = -ENOMEM; + + prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL); + if (!prz) { + pr_err("persistent_ram: failed to allocate persistent ram zone\n"); + goto err; + } + + ret = persistent_ram_buffer_init(dev_name(dev), prz); + if (ret) { + pr_err("persistent_ram: failed to initialize buffer\n"); + goto err; + } + + persistent_ram_post_init(prz, ecc); + + return prz; +err: + kfree(prz); + return ERR_PTR(ret); +} + +struct persistent_ram_zone * __init +persistent_ram_init_ringbuffer(struct device *dev, bool ecc) +{ + return __persistent_ram_init(dev, ecc); +} + +int __init persistent_ram_early_init(struct persistent_ram *ram) +{ + int ret; + + ret = memblock_reserve(ram->start, ram->size); + if (ret) { + pr_err("Failed to reserve persistent memory from %08lx-%08lx\n", + (long)ram->start, (long)(ram->start + ram->size - 1)); + return ret; + } + + list_add_tail(&ram->node, &persistent_ram_list); + + pr_info("Initialized persistent memory from %08lx-%08lx\n", + (long)ram->start, (long)(ram->start + ram->size - 1)); + + return 0; +} +#endif diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d69a1d1..10cbe84 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -116,15 +116,15 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > + * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > * dqio_mutex + * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem > * dqptr_sem. But filesystem has to count with the fact that functions such as * dquot_alloc_space() acquire dqptr_sem and they usually have to be called * from inside a transaction to keep filesystem consistency after a crash. Also * filesystems usually want to do some IO on dquot from ->mark_dirty which is * called with dqptr_sem held. 
- * i_mutex on quota files is special (it's below dqio_mutex) */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -638,7 +638,7 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) dqstats_inc(DQST_SYNCS); mutex_unlock(&dqopt->dqonoff_mutex); - if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + if (!wait || (dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return 0; /* This is not very clever (and fast) but currently I don't know about @@ -652,18 +652,17 @@ int dquot_quota_sync(struct super_block *sb, int type, int wait) * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + mutex_lock(&dqopt->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, - I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + mutex_lock(&dqopt->files[cnt]->i_mutex); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + mutex_unlock(&dqopt->files[cnt]->i_mutex); } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + mutex_unlock(&dqopt->dqonoff_mutex); return 0; } @@ -907,14 +906,14 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode->i_lock); continue; } -#ifdef CONFIG_QUOTA_DEBUG - if (unlikely(inode_get_rsv_space(inode) > 0)) - reserved = 1; -#endif __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&inode_sb_list_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; +#endif iput(old_inode); __dquot_initialize(inode, type); @@ -2037,8 +2036,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock_nested(&toputinode[cnt]->i_mutex, - I_MUTEX_QUOTA); + mutex_lock(&toputinode[cnt]->i_mutex); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, @@ -2133,7 +2131,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. 
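 * (mutex_lock_nested(..., I_MUTEX_QUOTA) becomes plain mutex_lock()
 * below: with the dqonoff_mutex > i_mutex ordering now documented at
 * the top of this file, the quota-file i_mutex no longer appears to
 * need its own lockdep subclass.)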
*/ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + mutex_lock(&inode->i_mutex); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; @@ -2180,7 +2178,7 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + mutex_lock(&inode->i_mutex); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 494c315..59d0687 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -76,14 +76,14 @@ void reiserfs_evict_inode(struct inode *inode) ; } out: - end_writeback(inode); /* note this must go after the journal_end to prevent deadlock */ + clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ dquot_drop(inode); inode->i_blocks = 0; reiserfs_write_unlock_once(inode->i_sb, depth); return; no_delete: - end_writeback(inode); + clear_inode(inode); dquot_drop(inode); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 8b7616e..c07b7d7 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2270,7 +2270,6 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2302,16 +2301,13 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) { - mutex_unlock(&inode->i_mutex); + if (len == towrite) return err; - } if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); return len - towrite; } @@ -57,12 +57,13 @@ EXPORT_SYMBOL(vfs_getattr); int vfs_fstat(unsigned int fd, struct kstat *stat) { - struct file *f = fget(fd); + int fput_needed; + struct file *f = fget_light(fd, &fput_needed); int error = -EBADF; if (f) { error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); - fput(f); + fput_light(f, fput_needed); } return error; } @@ -137,8 +138,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -190,24 +191,32 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #endif /* __ARCH_WANT_OLD_STAT */ +#if BITS_PER_LONG == 32 +# define choose_32_64(a,b) a +#else +# define choose_32_64(a,b) b +#endif + +#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) +#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) + +#ifndef INIT_STRUCT_STAT_PADDING +# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) +#endif + static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; -#if BITS_PER_LONG == 32 - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (!valid_dev(stat->dev) || 
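/* valid_dev() and encode_dev() were defined above via choose_32_64()
 * to pick the old_*_dev()/new_*_dev() helpers once, replacing the
 * repeated "#if BITS_PER_LONG == 32" blocks at each use site. */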
!valid_dev(stat->rdev)) return -EOVERFLOW; -#else - if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) +#if BITS_PER_LONG == 32 + if (stat->size > MAX_NON_LFS) return -EOVERFLOW; #endif - memset(&tmp, 0, sizeof(tmp)); -#if BITS_PER_LONG == 32 - tmp.st_dev = old_encode_dev(stat->dev); -#else - tmp.st_dev = new_encode_dev(stat->dev); -#endif + INIT_STRUCT_STAT_PADDING(tmp); + tmp.st_dev = encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -215,17 +224,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); -#if BITS_PER_LONG == 32 - tmp.st_rdev = old_encode_dev(stat->rdev); -#else - tmp.st_rdev = new_encode_dev(stat->rdev); -#endif -#if BITS_PER_LONG == 32 - if (stat->size > MAX_NON_LFS) - return -EOVERFLOW; -#endif + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -327,11 +328,15 @@ SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, /* ---------- LFS-64 ----------- */ #ifdef __ARCH_WANT_STAT64 +#ifndef INIT_STRUCT_STAT64_PADDING +# define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) +#endif + static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) { struct stat64 tmp; - memset(&tmp, 0, sizeof(struct stat64)); + INIT_STRUCT_STAT64_PADDING(tmp); #ifdef CONFIG_MIPS /* mips has weird padding, so we don't get 64 bits there */ if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) @@ -350,8 +355,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) #endif tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; - tmp.st_uid = stat->uid; - tmp.st_gid = stat->gid; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 35a36d3..e6bb9b2 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -132,6 +132,24 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +/* Test for attributes that want to ignore lockdep for read-locking */ +static bool ignore_lockdep(struct sysfs_dirent *sd) +{ + return sysfs_type(sd) == SYSFS_KOBJ_ATTR && + sd->s_attr.attr->ignore_lockdep; +} + +#else + +static inline bool ignore_lockdep(struct sysfs_dirent *sd) +{ + return true; +} + +#endif + /** * sysfs_get_active - get an active reference to sysfs_dirent * @sd: sysfs_dirent to get an active reference to @@ -155,15 +173,17 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) return NULL; t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) { - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; - } + if (likely(t == v)) + break; if (t < 0) return NULL; cpu_relax(); } + + if (likely(!ignore_lockdep(sd))) + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); + return sd; } /** @@ -180,7 +200,8 @@ void sysfs_put_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return; - 
rwsem_release(&sd->dep_map, 1, _RET_IP_); + if (likely(!ignore_lockdep(sd))) + rwsem_release(&sd->dep_map, 1, _RET_IP_); v = atomic_dec_return(&sd->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) return; @@ -858,7 +879,6 @@ int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, const void *new_ns, const char *new_name) { - const char *dup_name = NULL; int error; mutex_lock(&sysfs_mutex); @@ -875,11 +895,11 @@ int sysfs_rename(struct sysfs_dirent *sd, /* rename sysfs_dirent */ if (strcmp(sd->s_name, new_name) != 0) { error = -ENOMEM; - new_name = dup_name = kstrdup(new_name, GFP_KERNEL); + new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out; - dup_name = sd->s_name; + kfree(sd->s_name); sd->s_name = new_name; } @@ -895,7 +915,6 @@ int sysfs_rename(struct sysfs_dirent *sd, error = 0; out: mutex_unlock(&sysfs_mutex); - kfree(dup_name); return error; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index feb2d69..0ce3ccf 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -62,8 +62,8 @@ static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) /* assign default attributes */ iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = 0; - iattrs->ia_gid = 0; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; return attrs; @@ -310,7 +310,7 @@ void sysfs_evict_inode(struct inode *inode) struct sysfs_dirent *sd = inode->i_private; truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); sysfs_put(sd); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3da5ce2..08d0b25 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -316,7 +316,7 @@ static void sysv_evict_inode(struct inode *inode) sysv_truncate(inode); } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (!inode->i_nlink) sysv_free_inode(inode); } diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index f8b0160..ba66d508 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -11,12 +11,6 @@ config UBIFS_FS help UBIFS is a file system for flash devices which works on top of UBI. -config UBIFS_FS_XATTR - bool "Extended attributes support" - depends on UBIFS_FS - help - This option enables support of extended attributes. - config UBIFS_FS_ADVANCED_COMPR bool "Advanced compression options" depends on UBIFS_FS @@ -41,20 +35,3 @@ config UBIFS_FS_ZLIB default y help Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. - -# Debugging-related stuff -config UBIFS_FS_DEBUG - bool "Enable debugging support" - depends on UBIFS_FS - select DEBUG_FS - select KALLSYMS - help - This option enables UBIFS debugging support. It makes sure various - assertions, self-checks, debugging messages and test modes are compiled - in (this all is compiled out otherwise). Assertions are light-weight - and this option also enables them. Self-checks, debugging messages and - test modes are switched off by default. Thus, it is safe and actually - recommended to have debugging support enabled, and it should not slow - down UBIFS. You can then further enable / disable individual debugging - features using UBIFS module parameters and the corresponding sysfs - interfaces. 
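Both UBIFS options above go away because the code they gated is about to be built unconditionally; the Makefile change that follows folds xattr.o and debug.o into ubifs-y. Compile-time switches like these force a parallel set of no-op stubs so call sites still compile and link when the option is off. A minimal sketch of that pattern (CONFIG_FOO_DEBUG, struct foo and foo_dump_state() are hypothetical names for illustration, not symbols from this patch):

struct foo;

#ifdef CONFIG_FOO_DEBUG
/* real implementation lives in foo_debug.o, built only when =y */
void foo_dump_state(const struct foo *f);
#else
/* no-op stand-in so callers still compile when debugging is configured out */
static inline void foo_dump_state(const struct foo *f) { }
#endif

With debug.o always linked in, the entire #else half of fs/ubifs/debug.h (the large stub block removed further down) can be deleted, at the cost of carrying a little debug code in every build.
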
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 80e93c3..2c6f0cb 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -3,7 +3,4 @@ obj-$(CONFIG_UBIFS_FS) += ubifs.o ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o -ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o - -ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o -ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index fb3b5c8..8eda717 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -496,7 +496,9 @@ int ubifs_gc_should_commit(struct ubifs_info *c) return ret; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * struct idx_node - hold index nodes during index tree traversal. @@ -714,14 +716,14 @@ out: return 0; out_dump: - dbg_err("dumping index node (iip=%d)", i->iip); - dbg_dump_node(c, idx); + ubifs_err("dumping index node (iip=%d)", i->iip); + ubifs_dump_node(c, idx); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); - dbg_err("dumping parent index node"); - dbg_dump_node(c, &i->idx); + ubifs_err("dumping parent index node"); + ubifs_dump_node(c, &i->idx); } out_free: while (!list_empty(&list)) { @@ -734,5 +736,3 @@ out_free: err = -EINVAL; return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1934084..685a837 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -34,8 +34,6 @@ #include <linux/random.h> #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG - static DEFINE_SPINLOCK(dbg_lock); static const char *get_key_fmt(int fmt) @@ -232,7 +230,7 @@ static void dump_ch(const struct ubifs_ch *ch) printk(KERN_ERR "\tlen %u\n", le32_to_cpu(ch->len)); } -void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); struct qstr nm = { .name = NULL }; @@ -300,7 +298,7 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) kfree(pdent); } -void dbg_dump_node(const struct ubifs_info *c, const void *node) +void ubifs_dump_node(const struct ubifs_info *c, const void *node) { int i, n; union ubifs_key key; @@ -603,7 +601,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) spin_unlock(&dbg_lock); } -void dbg_dump_budget_req(const struct ubifs_budget_req *req) +void ubifs_dump_budget_req(const struct ubifs_budget_req *req) { spin_lock(&dbg_lock); printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n", @@ -620,7 +618,7 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req) spin_unlock(&dbg_lock); } -void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, " @@ -634,7 +632,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst) spin_unlock(&dbg_lock); } -void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) { int i; struct rb_node *rb; @@ -707,7 +705,7 @@ out_unlock: spin_unlock(&c->space_lock); } -void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +void 
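/* Part of the tree-wide rename in this patch: dbg_dump_*() becomes
 * ubifs_dump_*(), since these dumpers are now always built into UBIFS
 * rather than being debug-only symbols. */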
ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) { int i, spc, dark = 0, dead = 0; struct rb_node *rb; @@ -801,7 +799,7 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) printk(KERN_CONT ")\n"); } -void dbg_dump_lprops(struct ubifs_info *c) +void ubifs_dump_lprops(struct ubifs_info *c) { int lnum, err; struct ubifs_lprops lp; @@ -810,20 +808,20 @@ void dbg_dump_lprops(struct ubifs_info *c) printk(KERN_ERR "(pid %d) start dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); - dbg_dump_lstats(&lst); + ubifs_dump_lstats(&lst); for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { err = ubifs_read_one_lp(c, lnum, &lp); if (err) ubifs_err("cannot read lprops for LEB %d", lnum); - dbg_dump_lprop(c, &lp); + ubifs_dump_lprop(c, &lp); } printk(KERN_ERR "(pid %d) finish dumping LEB properties\n", current->pid); } -void dbg_dump_lpt_info(struct ubifs_info *c) +void ubifs_dump_lpt_info(struct ubifs_info *c) { int i; @@ -862,8 +860,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } -void dbg_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { struct ubifs_scan_node *snod; @@ -874,11 +872,11 @@ void dbg_dump_sleb(const struct ubifs_info *c, cond_resched(); printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum, snod->offs, snod->len); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); } } -void dbg_dump_leb(const struct ubifs_info *c, int lnum) +void ubifs_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; @@ -909,7 +907,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) cond_resched(); printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum, snod->offs, snod->len); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); } printk(KERN_ERR "(pid %d) finish dumping LEB %d\n", @@ -921,8 +919,8 @@ out: return; } -void dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode) +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) { int n; const struct ubifs_zbranch *zbr; @@ -965,7 +963,7 @@ void dbg_dump_znode(const struct ubifs_info *c, spin_unlock(&dbg_lock); } -void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; @@ -981,8 +979,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid); } -void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, int iip) +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) { int i; @@ -999,7 +997,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, } } -void dbg_dump_tnc(struct ubifs_info *c) +void ubifs_dump_tnc(struct ubifs_info *c) { struct ubifs_znode *znode; int level; @@ -1014,7 +1012,7 @@ void dbg_dump_tnc(struct ubifs_info *c) level = znode->level; printk(KERN_ERR "== Level %d ==\n", level); } - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); } printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid); @@ -1023,18 +1021,18 @@ void dbg_dump_tnc(struct ubifs_info *c) static int 
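/* dbg_walk_index() callback: dump every znode visited (used by
 * ubifs_dump_index() just below). */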
dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, void *priv) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return 0; } /** - * dbg_dump_index - dump the on-flash index. + * ubifs_dump_index - dump the on-flash index. * @c: UBIFS file-system description object * - * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()' * which dumps only in-memory znodes and does not read znodes which from flash. */ -void dbg_dump_index(struct ubifs_info *c) +void ubifs_dump_index(struct ubifs_info *c) { dbg_walk_index(c, NULL, dump_znode, NULL); } @@ -1120,15 +1118,15 @@ int dbg_check_space_info(struct ubifs_info *c) out: ubifs_msg("saved lprops statistics dump"); - dbg_dump_lstats(&d->saved_lst); + ubifs_dump_lstats(&d->saved_lst); ubifs_msg("saved budgeting info dump"); - dbg_dump_budg(c, &d->saved_bi); + ubifs_dump_budg(c, &d->saved_bi); ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); ubifs_msg("current lprops statistics dump"); ubifs_get_lp_stats(c, &lst); - dbg_dump_lstats(&lst); + ubifs_dump_lstats(&lst); ubifs_msg("current budgeting info dump"); - dbg_dump_budg(c, &c->bi); + ubifs_dump_budg(c, &c->bi); dump_stack(); return -EINVAL; } @@ -1160,7 +1158,7 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) "is clean", ui->ui_size, ui->synced_i_size); ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); - dbg_dump_stack(); + dump_stack(); err = -EINVAL; } spin_unlock(&ui->ui_lock); @@ -1223,14 +1221,14 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) "but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); - dbg_dump_inode(c, dir); + ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } if (dir->i_nlink != nlink) { ubifs_err("directory inode %lu has nlink %u, but calculated " "nlink is %u", dir->i_ino, dir->i_nlink, nlink); - dbg_dump_inode(c, dir); + ubifs_dump_inode(c, dir); dump_stack(); return -EINVAL; } @@ -1287,25 +1285,25 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, err = 1; key_read(c, &dent1->key, &key); if (keys_cmp(c, &zbr1->key, &key)) { - dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, dbg_snprintf_key(c, &key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_err("but it should have key %s according to tnc", - dbg_snprintf_key(c, &zbr1->key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_dump_node(c, dent1); + ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err("but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr1->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent1); goto out_free; } key_read(c, &dent2->key, &key); if (keys_cmp(c, &zbr2->key, &key)) { - dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, - zbr1->offs, dbg_snprintf_key(c, &key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_err("but it should have key %s according to tnc", - dbg_snprintf_key(c, &zbr2->key, key_buf, - DBG_KEY_BUF_LEN)); - dbg_dump_node(c, dent2); + ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum, + zbr1->offs, dbg_snprintf_key(c, &key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_err("but it should have key %s according to tnc", + dbg_snprintf_key(c, &zbr2->key, key_buf, + DBG_KEY_BUF_LEN)); + ubifs_dump_node(c, dent2); goto out_free; } @@ -1318,15 +1316,15 @@ static int dbg_check_key_order(struct 
ubifs_info *c, struct ubifs_zbranch *zbr1, goto out_free; } if (cmp == 0 && nlen1 == nlen2) - dbg_err("2 xent/dent nodes with the same name"); + ubifs_err("2 xent/dent nodes with the same name"); else - dbg_err("bad order of colliding key %s", - dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); + ubifs_err("bad order of colliding key %s", + dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); - dbg_dump_node(c, dent1); + ubifs_dump_node(c, dent1); ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); - dbg_dump_node(c, dent2); + ubifs_dump_node(c, dent2); out_free: kfree(dent2); @@ -1529,10 +1527,10 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) out: ubifs_err("failed, error %d", err); ubifs_msg("dump of the znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zp) { ubifs_msg("dump of the parent znode"); - dbg_dump_znode(c, zp); + ubifs_dump_znode(c, zp); } dump_stack(); return -EINVAL; @@ -1599,9 +1597,9 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) return err; if (err) { ubifs_msg("first znode"); - dbg_dump_znode(c, prev); + ubifs_dump_znode(c, prev); ubifs_msg("second znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return -EINVAL; } } @@ -1690,7 +1688,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, if (err) { ubifs_err("znode checking function returned " "error %d", err); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); goto out_dump; } } @@ -1758,7 +1756,7 @@ out_dump: else zbr = &c->zroot; ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); out_unlock: mutex_unlock(&c->tnc_mutex); return err; @@ -2194,7 +2192,7 @@ out: out_dump: ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_node(c, node); + ubifs_dump_node(c, node); out_free: kfree(node); return err; @@ -2352,7 +2350,7 @@ out_dump: ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", (unsigned long)fscki->inum, zbr->lnum, zbr->offs); - dbg_dump_node(c, ino); + ubifs_dump_node(c, ino); kfree(ino); return -EINVAL; } @@ -2423,12 +2421,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_DATA_NODE) { ubifs_err("bad node type %d", sa->type); - dbg_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node); return -EINVAL; } if (sb->type != UBIFS_DATA_NODE) { ubifs_err("bad node type %d", sb->type); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2459,8 +2457,8 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) return 0; error_dump: - dbg_dump_node(c, sa->node); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sa->node); + ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2491,13 +2489,13 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err("bad node type %d", sa->type); - dbg_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node); return -EINVAL; } if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { ubifs_err("bad node type %d", sb->type); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2547,9 +2545,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) error_dump: ubifs_msg("dumping first node"); - 
dbg_dump_node(c, sa->node); + ubifs_dump_node(c, sa->node); ubifs_msg("dumping second node"); - dbg_dump_node(c, sb->node); + ubifs_dump_node(c, sb->node); return -EINVAL; return 0; } @@ -2678,7 +2676,7 @@ static void cut_data(const void *buf, unsigned int len) } int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, - int offs, int len, int dtype) + int offs, int len) { int err, failing; @@ -2688,7 +2686,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, failing = power_cut_emulated(c, lnum, 1); if (failing) cut_data(buf, len); - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); if (err) return err; if (failing) @@ -2697,7 +2695,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, } int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, - int len, int dtype) + int len) { int err; @@ -2705,7 +2703,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, return -EROFS; if (power_cut_emulated(c, lnum, 1)) return -EROFS; - err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + err = ubi_leb_change(c->ubi, lnum, buf, len); if (err) return err; if (power_cut_emulated(c, lnum, 1)) @@ -2729,7 +2727,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum) return 0; } -int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype) +int dbg_leb_map(struct ubifs_info *c, int lnum) { int err; @@ -2737,7 +2735,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype) return -EROFS; if (power_cut_emulated(c, lnum, 0)) return -EROFS; - err = ubi_leb_map(c->ubi, lnum, dtype); + err = ubi_leb_map(c->ubi, lnum); if (err) return err; if (power_cut_emulated(c, lnum, 0)) @@ -2857,16 +2855,16 @@ static ssize_t dfs_file_write(struct file *file, const char __user *u, * 'ubifs-debug' file-system instead. */ if (file->f_path.dentry == d->dfs_dump_lprops) { - dbg_dump_lprops(c); + ubifs_dump_lprops(c); return count; } if (file->f_path.dentry == d->dfs_dump_budg) { - dbg_dump_budg(c, &c->bi); + ubifs_dump_budg(c, &c->bi); return count; } if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); - dbg_dump_tnc(c); + ubifs_dump_tnc(c); mutex_unlock(&c->tnc_mutex); return count; } @@ -3189,5 +3187,3 @@ void ubifs_debugging_exit(struct ubifs_info *c) { kfree(c->dbg); } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 9f71765..486a8e0 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -29,8 +29,6 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c, typedef int (*dbg_znode_callback)(struct ubifs_info *c, struct ubifs_znode *znode, void *priv); -#ifdef CONFIG_UBIFS_FS_DEBUG - /* * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. @@ -149,7 +147,7 @@ struct ubifs_global_debug_info { if (unlikely(!(expr))) { \ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ - dbg_dump_stack(); \ + dump_stack(); \ } \ } while (0) @@ -161,12 +159,6 @@ struct ubifs_global_debug_info { } \ } while (0) -#define dbg_dump_stack() dump_stack() - -#define dbg_err(fmt, ...) do { \ - ubifs_err(fmt, ##__VA_ARGS__); \ -} while (0) - #define ubifs_dbg_msg(type, fmt, ...) 
\ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__) @@ -257,27 +249,27 @@ const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); const char *dbg_snprintf_key(const struct ubifs_info *c, const union ubifs_key *key, char *buffer, int len); -void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); -void dbg_dump_node(const struct ubifs_info *c, const void *node); -void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, - int offs); -void dbg_dump_budget_req(const struct ubifs_budget_req *req); -void dbg_dump_lstats(const struct ubifs_lp_stats *lst); -void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); -void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); -void dbg_dump_lprops(struct ubifs_info *c); -void dbg_dump_lpt_info(struct ubifs_info *c); -void dbg_dump_leb(const struct ubifs_info *c, int lnum); -void dbg_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs); -void dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode); -void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); -void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, int iip); -void dbg_dump_tnc(struct ubifs_info *c); -void dbg_dump_index(struct ubifs_info *c); -void dbg_dump_lpt_lebs(const struct ubifs_info *c); +void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode); +void ubifs_dump_node(const struct ubifs_info *c, const void *node); +void ubifs_dump_budget_req(const struct ubifs_budget_req *req); +void ubifs_dump_lstats(const struct ubifs_lp_stats *lst); +void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); +void ubifs_dump_lprop(const struct ubifs_info *c, + const struct ubifs_lprops *lp); +void ubifs_dump_lprops(struct ubifs_info *c); +void ubifs_dump_lpt_info(struct ubifs_info *c); +void ubifs_dump_leb(const struct ubifs_info *c, int lnum); +void ubifs_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); +void ubifs_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + int cat); +void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void ubifs_dump_tnc(struct ubifs_info *c); +void ubifs_dump_index(struct ubifs_info *c); +void ubifs_dump_lpt_lebs(const struct ubifs_info *c); int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); @@ -307,11 +299,10 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype); -int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype); + int len); +int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); int dbg_leb_unmap(struct ubifs_info *c, int lnum); -int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype); +int dbg_leb_map(struct ubifs_info *c, int lnum); /* Debugfs-related stuff */ int dbg_debugfs_init(void); @@ -319,162 +310,4 @@ void dbg_debugfs_exit(void); int dbg_debugfs_init_fs(struct ubifs_info *c); void dbg_debugfs_exit_fs(struct ubifs_info *c); -#else /* !CONFIG_UBIFS_FS_DEBUG */ - -/* Use "if (0)" to make 
compiler check arguments even if debugging is off */ -#define ubifs_assert(expr) do { \ - if (0) \ - printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ - __func__, __LINE__, current->pid); \ -} while (0) - -#define dbg_err(fmt, ...) do { \ - if (0) \ - ubifs_err(fmt, ##__VA_ARGS__); \ -} while (0) - -#define DBGKEY(key) ((char *)(key)) -#define DBGKEY1(key) ((char *)(key)) - -#define ubifs_dbg_msg(fmt, ...) do { \ - if (0) \ - printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \ -} while (0) - -#define dbg_dump_stack() -#define ubifs_assert_cmt_locked(c) - -#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) - -static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } -static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } -static inline const char *dbg_ntype(int type) { return ""; } -static inline const char *dbg_cstate(int cmt_state) { return ""; } -static inline const char *dbg_jhead(int jhead) { return ""; } -static inline const char * -dbg_get_key_dump(const struct ubifs_info *c, - const union ubifs_key *key) { return ""; } -static inline const char * -dbg_snprintf_key(const struct ubifs_info *c, - const union ubifs_key *key, char *buffer, - int len) { return ""; } -static inline void dbg_dump_inode(struct ubifs_info *c, - const struct inode *inode) { return; } -static inline void dbg_dump_node(const struct ubifs_info *c, - const void *node) { return; } -static inline void dbg_dump_lpt_node(const struct ubifs_info *c, - void *node, int lnum, - int offs) { return; } -static inline void -dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } -static inline void -dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } -static inline void -dbg_dump_budg(struct ubifs_info *c, - const struct ubifs_budg_info *bi) { return; } -static inline void dbg_dump_lprop(const struct ubifs_info *c, - const struct ubifs_lprops *lp) { return; } -static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } -static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } -static inline void dbg_dump_leb(const struct ubifs_info *c, - int lnum) { return; } -static inline void -dbg_dump_sleb(const struct ubifs_info *c, - const struct ubifs_scan_leb *sleb, int offs) { return; } -static inline void -dbg_dump_znode(const struct ubifs_info *c, - const struct ubifs_znode *znode) { return; } -static inline void dbg_dump_heap(struct ubifs_info *c, - struct ubifs_lpt_heap *heap, - int cat) { return; } -static inline void dbg_dump_pnode(struct ubifs_info *c, 
- struct ubifs_pnode *pnode, - struct ubifs_nnode *parent, - int iip) { return; } -static inline void dbg_dump_tnc(struct ubifs_info *c) { return; } -static inline void dbg_dump_index(struct ubifs_info *c) { return; } -static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; } - -static inline int dbg_walk_index(struct ubifs_info *c, - dbg_leaf_callback leaf_cb, - dbg_znode_callback znode_cb, - void *priv) { return 0; } -static inline void dbg_save_space_info(struct ubifs_info *c) { return; } -static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; } -static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; } -static inline int -dbg_old_index_check_init(struct ubifs_info *c, - struct ubifs_zbranch *zroot) { return 0; } -static inline int -dbg_check_old_index(struct ubifs_info *c, - struct ubifs_zbranch *zroot) { return 0; } -static inline int dbg_check_cats(struct ubifs_info *c) { return 0; } -static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; } -static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; } -static inline int dbg_chk_lpt_sz(struct ubifs_info *c, - int action, int len) { return 0; } -static inline int -dbg_check_synced_i_size(const struct ubifs_info *c, - struct inode *inode) { return 0; } -static inline int dbg_check_dir(struct ubifs_info *c, - const struct inode *dir) { return 0; } -static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; } -static inline int dbg_check_idx_size(struct ubifs_info *c, - long long idx_size) { return 0; } -static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; } -static inline void dbg_check_heap(struct ubifs_info *c, - struct ubifs_lpt_heap *heap, - int cat, int add_pos) { return; } -static inline int dbg_check_lpt_nodes(struct ubifs_info *c, - struct ubifs_cnode *cnode, int row, int col) { return 0; } -static inline int dbg_check_inode_size(struct ubifs_info *c, - const struct inode *inode, - loff_t size) { return 0; } -static inline int -dbg_check_data_nodes_order(struct ubifs_info *c, - struct list_head *head) { return 0; } -static inline int -dbg_check_nondata_nodes_order(struct ubifs_info *c, - struct list_head *head) { return 0; } - -static inline int dbg_leb_write(struct ubifs_info *c, int lnum, - const void *buf, int offset, - int len, int dtype) { return 0; } -static inline int dbg_leb_change(struct ubifs_info *c, int lnum, - const void *buf, int len, - int dtype) { return 0; } -static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum) { return 0; } -static inline int dbg_leb_map(struct ubifs_info *c, int lnum, - int dtype) { return 0; } - -static inline int dbg_is_chk_gen(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_index(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_orph(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_lprops(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_chk_fs(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_tst_rcvry(const struct ubifs_info *c) { return 0; } -static inline int dbg_is_power_cut(const struct ubifs_info *c) { return 0; } - -static inline int dbg_debugfs_init(void) { return 0; } -static inline void dbg_debugfs_exit(void) { return; } -static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; } -static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; } - -#endif /* !CONFIG_UBIFS_FS_DEBUG */ #endif /* !__UBIFS_DEBUG_H__ */ diff --git 
a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ec9f187..62a2727 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -170,8 +170,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, return inode; } -#ifdef CONFIG_UBIFS_FS_DEBUG - static int dbg_check_name(const struct ubifs_info *c, const struct ubifs_dent_node *dent, const struct qstr *nm) @@ -185,12 +183,6 @@ static int dbg_check_name(const struct ubifs_info *c, return 0; } -#else - -#define dbg_check_name(c, dent, nm) 0 - -#endif - static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -1187,12 +1179,10 @@ const struct inode_operations ubifs_dir_inode_operations = { .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, -#ifdef CONFIG_UBIFS_FS_XATTR .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, -#endif }; const struct file_operations ubifs_dir_operations = { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5c8f6dc..35389ca 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -97,7 +97,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dump: ubifs_err("bad data node (block %u, inode %lu)", block, inode->i_ino); - dbg_dump_node(c, dn); + ubifs_dump_node(c, dn); return -EINVAL; } @@ -1562,12 +1562,10 @@ const struct address_space_operations ubifs_file_address_operations = { const struct inode_operations ubifs_file_inode_operations = { .setattr = ubifs_setattr, .getattr = ubifs_getattr, -#ifdef CONFIG_UBIFS_FS_XATTR .setxattr = ubifs_setxattr, .getxattr = ubifs_getxattr, .listxattr = ubifs_listxattr, .removexattr = ubifs_removexattr, -#endif }; const struct inode_operations ubifs_symlink_inode_operations = { diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index ded29f6..04dd6f4 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -109,7 +109,7 @@ static int switch_gc_head(struct ubifs_info *c) return err; c->gc_lnum = -1; - err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM); + err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0); return err; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 9228950..e18b988 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -109,13 +109,13 @@ int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, if (err && (err != -EBADMSG || even_ebadmsg)) { ubifs_err("reading %d bytes from LEB %d:%d failed, error %d", len, lnum, offs, err); - dbg_dump_stack(); + dump_stack(); } return err; } int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype) + int len) { int err; @@ -123,20 +123,19 @@ int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, if (c->ro_error) return -EROFS; if (!dbg_is_tst_rcvry(c)) - err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + err = ubi_leb_write(c->ubi, lnum, buf, offs, len); else - err = dbg_leb_write(c, lnum, buf, offs, len, dtype); + err = dbg_leb_write(c, lnum, buf, offs, len); if (err) { ubifs_err("writing %d bytes to LEB %d:%d failed, error %d", len, lnum, offs, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } -int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype) +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len) { int err; @@ -144,14 +143,14 @@ int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, if (c->ro_error) return -EROFS; if (!dbg_is_tst_rcvry(c)) - err = 
ubi_leb_change(c->ubi, lnum, buf, len, dtype); + err = ubi_leb_change(c->ubi, lnum, buf, len); else - err = dbg_leb_change(c, lnum, buf, len, dtype); + err = dbg_leb_change(c, lnum, buf, len); if (err) { ubifs_err("changing %d bytes in LEB %d failed, error %d", len, lnum, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } @@ -170,12 +169,12 @@ int ubifs_leb_unmap(struct ubifs_info *c, int lnum) if (err) { ubifs_err("unmap LEB %d failed, error %d", lnum, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } -int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype) +int ubifs_leb_map(struct ubifs_info *c, int lnum) { int err; @@ -183,13 +182,13 @@ int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype) if (c->ro_error) return -EROFS; if (!dbg_is_tst_rcvry(c)) - err = ubi_leb_map(c->ubi, lnum, dtype); + err = ubi_leb_map(c->ubi, lnum); else - err = dbg_leb_map(c, lnum, dtype); + err = dbg_leb_map(c, lnum); if (err) { ubifs_err("mapping LEB %d failed, error %d", lnum, err); ubifs_ro_mode(c, err); - dbg_dump_stack(); + dump_stack(); } return err; } @@ -202,7 +201,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) if (err < 0) { ubifs_err("ubi_is_mapped failed for LEB %d, error %d", lnum, err); - dbg_dump_stack(); + dump_stack(); } return err; } @@ -294,8 +293,8 @@ out_len: out: if (!quiet) { ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); } return err; } @@ -523,8 +522,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dirt = sync_len - wbuf->used; if (dirt) ubifs_pad(c, wbuf->buf + wbuf->used, dirt); - err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len, - wbuf->dtype); + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len); if (err) return err; @@ -562,14 +560,12 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) * @wbuf: write-buffer * @lnum: logical eraseblock number to seek to * @offs: logical eraseblock offset to seek to - * @dtype: data type * * This function targets the write-buffer to logical eraseblock @lnum:@offs. * The write-buffer has to be empty. Returns zero in case of success and a * negative error code in case of failure. 
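 * (The @dtype parameter is dropped here and from the LEB I/O helpers
 * above: the UBI interface no longer takes per-LEB data-type hints, so
 * callers supply only @lnum:@offs.)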
*/ -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype) +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs) { const struct ubifs_info *c = wbuf->c; @@ -592,7 +588,6 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); - wbuf->dtype = dtype; return 0; } @@ -719,8 +714,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, - wbuf->offs, wbuf->size, - wbuf->dtype); + wbuf->offs, wbuf->size); if (err) goto out; @@ -756,7 +750,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, - wbuf->size, wbuf->dtype); + wbuf->size); if (err) goto out; @@ -775,7 +769,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("write %d bytes to LEB %d:%d", wbuf->size, wbuf->lnum, wbuf->offs); err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs, - wbuf->size, wbuf->dtype); + wbuf->size); if (err) goto out; @@ -797,7 +791,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n, wbuf->dtype); + wbuf->offs, n); if (err) goto out; wbuf->offs += n; @@ -841,9 +835,9 @@ exit: out: ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); - dbg_dump_node(c, buf); - dbg_dump_stack(); - dbg_dump_leb(c, wbuf->lnum); + ubifs_dump_node(c, buf); + dump_stack(); + ubifs_dump_leb(c, wbuf->lnum); return err; } @@ -854,7 +848,6 @@ out: * @len: node length * @lnum: logical eraseblock number * @offs: offset within the logical eraseblock - * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) * * This function automatically fills node magic number, assigns sequence * number, and calculates node CRC checksum. The length of the @buf buffer has @@ -863,7 +856,7 @@ out: * success and a negative error code in case of failure. 
*/ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, - int offs, int dtype) + int offs) { int err, buf_len = ALIGN(len, c->min_io_size); @@ -879,9 +872,9 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, return -EROFS; ubifs_prepare_node(c, buf, len, 1); - err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype); + err = ubifs_leb_write(c, lnum, buf, offs, buf_len); if (err) - dbg_dump_node(c, buf); + ubifs_dump_node(c, buf); return err; } @@ -960,8 +953,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, out: ubifs_err("bad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return -EINVAL; } @@ -1017,8 +1010,8 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, out: ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, ubi_is_mapped(c->ubi, lnum)); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return -EINVAL; } @@ -1056,7 +1049,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) */ size = c->max_write_size - (c->leb_start % c->max_write_size); wbuf->avail = wbuf->size = size; - wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); spin_lock_init(&wbuf->lock); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 2f438ab..12c0f15 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -214,7 +214,7 @@ out: err = ubifs_add_bud_to_log(c, jhead, lnum, offs); if (err) goto out_return; - err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs); if (err) goto out_unlock; @@ -385,9 +385,9 @@ out: if (err == -ENOSPC) { /* This are some budgeting problems, print useful information */ down_write(&c->commit_sem); - dbg_dump_stack(); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + dump_stack(); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); } @@ -1267,7 +1267,6 @@ out_free: return err; } -#ifdef CONFIG_UBIFS_FS_XATTR /** * ubifs_jnl_delete_xattr - delete an extended attribute. @@ -1462,4 +1461,3 @@ out_free: return err; } -#endif /* CONFIG_UBIFS_FS_XATTR */ diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index f9fd068..c80b15d 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -29,11 +29,7 @@ #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_bud_bytes(struct ubifs_info *c); -#else -#define dbg_check_bud_bytes(c) 0 -#endif /** * ubifs_search_bud - search bud LEB. @@ -262,7 +258,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) * an unclean reboot, because the target LEB might have been * unmapped, but not yet physically erased. 
*/ - err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM); + err = ubifs_leb_map(c, bud->lnum); if (err) goto out_unlock; } @@ -270,7 +266,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) dbg_log("write ref LEB %d:%d", c->lhead_lnum, c->lhead_offs); err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum, - c->lhead_offs, UBI_SHORTTERM); + c->lhead_offs); if (err) goto out_unlock; @@ -422,7 +418,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) len = ALIGN(len, c->min_io_size); dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); - err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM); + err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len); if (err) goto out; @@ -623,7 +619,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, int sz = ALIGN(*offs, c->min_io_size), err; ubifs_pad(c, buf + *offs, sz - *offs); - err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, *lnum, buf, sz); if (err) return err; *lnum = ubifs_next_log_lnum(c, *lnum); @@ -702,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) int sz = ALIGN(offs, c->min_io_size); ubifs_pad(c, buf + offs, sz - offs); - err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, write_lnum, buf, sz); if (err) goto out_free; offs = ALIGN(offs, c->min_io_size); @@ -734,8 +730,6 @@ out_free: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_check_bud_bytes - make sure bud bytes calculation are all right. * @c: UBIFS file-system description object @@ -767,5 +761,3 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index f8a181e..86eb8e5 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -447,7 +447,7 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) int new_cat = ubifs_categorize_lprops(c, lprops); if (old_cat == new_cat) { - struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1]; + struct ubifs_lpt_heap *heap; /* lprops on a heap now must be moved up or down */ if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) @@ -846,7 +846,9 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) return lprops; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * dbg_check_cats - check category heaps and lists. 
@@ -1001,8 +1003,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, out: if (err) { dbg_msg("failed cat %d hpos %d err %d", cat, i, err); - dbg_dump_stack(); - dbg_dump_heap(c, heap, cat); + dump_stack(); + ubifs_dump_heap(c, heap, cat); } } @@ -1109,8 +1111,8 @@ static int scan_check_cb(struct ubifs_info *c, if (IS_ERR(sleb)) { ret = PTR_ERR(sleb); if (ret == -EUCLEAN) { - dbg_dump_lprops(c); - dbg_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); } goto out; } @@ -1237,7 +1239,7 @@ out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " "should be free %d, dirty %d", lnum, lp->free, lp->dirty, lp->flags, free, dirty); - dbg_dump_leb(c, lnum); + ubifs_dump_leb(c, lnum); out_destroy: ubifs_scan_destroy(sleb); ret = -EINVAL; @@ -1315,5 +1317,3 @@ int dbg_check_lprops(struct ubifs_info *c) out: return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 66d59d0..ce33b2b 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -701,8 +701,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -732,8 +731,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -780,8 +778,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -806,7 +803,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, alen = ALIGN(len, c->min_io_size); set_ltab(c, lnum, c->leb_size - alen, alen - len); memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum++, buf, alen); if (err) goto out; p = buf; @@ -826,7 +823,7 @@ int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, /* Write remaining buffer */ memset(p, 0xff, alen - len); - err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, buf, alen); if (err) goto out; @@ -926,7 +923,7 @@ static int check_lpt_crc(void *buf, int len) if (crc != calc_crc) { ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, calc_crc); - dbg_dump_stack(); + dump_stack(); return -EINVAL; } return 0; @@ -949,7 +946,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type) if (node_type != type) { ubifs_err("invalid type (%d) in LPT node type %d", node_type, type); - dbg_dump_stack(); + dump_stack(); return -EINVAL; } return 0; @@ -1247,7 +1244,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); - dbg_dump_stack(); + dump_stack(); kfree(nnode); return err; } @@ -1312,8 +1309,8 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) out: ubifs_err("error %d reading 
pnode at %d:%d", err, lnum, offs); - dbg_dump_pnode(c, pnode, parent, iip); - dbg_dump_stack(); + ubifs_dump_pnode(c, pnode, parent, iip); + dump_stack(); dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; @@ -1740,16 +1737,20 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr) if (rd) { err = lpt_init_rd(c); if (err) - return err; + goto out_err; } if (wr) { err = lpt_init_wr(c); if (err) - return err; + goto out_err; } return 0; + +out_err: + ubifs_lpt_free(c, 0); + return err; } /** @@ -2080,8 +2081,6 @@ out: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_chk_pnode - check a pnode. * @c: the UBIFS file-system description object @@ -2096,8 +2095,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, int i; if (pnode->num != col) { - dbg_err("pnode num %d expected %d parent num %d iip %d", - pnode->num, col, pnode->parent->num, pnode->iip); + ubifs_err("pnode num %d expected %d parent num %d iip %d", + pnode->num, col, pnode->parent->num, pnode->iip); return -EINVAL; } for (i = 0; i < UBIFS_LPT_FANOUT; i++) { @@ -2111,14 +2110,14 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, if (lnum >= c->leb_cnt) continue; if (lprops->lnum != lnum) { - dbg_err("bad LEB number %d expected %d", - lprops->lnum, lnum); + ubifs_err("bad LEB number %d expected %d", + lprops->lnum, lnum); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { if (cat != LPROPS_UNCAT) { - dbg_err("LEB %d taken but not uncat %d", - lprops->lnum, cat); + ubifs_err("LEB %d taken but not uncat %d", + lprops->lnum, cat); return -EINVAL; } continue; @@ -2130,8 +2129,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FRDI_IDX: break; default: - dbg_err("LEB %d index but cat %d", - lprops->lnum, cat); + ubifs_err("LEB %d index but cat %d", + lprops->lnum, cat); return -EINVAL; } } else { @@ -2143,8 +2142,8 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FREEABLE: break; default: - dbg_err("LEB %d not index but cat %d", - lprops->lnum, cat); + ubifs_err("LEB %d not index but cat %d", + lprops->lnum, cat); return -EINVAL; } } @@ -2184,24 +2183,24 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, break; } if (!found) { - dbg_err("LEB %d cat %d not found in cat heap/list", - lprops->lnum, cat); + ubifs_err("LEB %d cat %d not found in cat heap/list", + lprops->lnum, cat); return -EINVAL; } switch (cat) { case LPROPS_EMPTY: if (lprops->free != c->leb_size) { - dbg_err("LEB %d cat %d free %d dirty %d", - lprops->lnum, cat, lprops->free, - lprops->dirty); + ubifs_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); return -EINVAL; } case LPROPS_FREEABLE: case LPROPS_FRDI_IDX: if (lprops->free + lprops->dirty != c->leb_size) { - dbg_err("LEB %d cat %d free %d dirty %d", - lprops->lnum, cat, lprops->free, - lprops->dirty); + ubifs_err("LEB %d cat %d free %d dirty %d", + lprops->lnum, cat, lprops->free, + lprops->dirty); return -EINVAL; } } @@ -2235,9 +2234,10 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, /* cnode is a nnode */ num = calc_nnode_num(row, col); if (cnode->num != num) { - dbg_err("nnode num %d expected %d " - "parent num %d iip %d", cnode->num, num, - (nnode ? nnode->num : 0), cnode->iip); + ubifs_err("nnode num %d expected %d " + "parent num %d iip %d", + cnode->num, num, + (nnode ? 
nnode->num : 0), cnode->iip); return -EINVAL; } nn = (struct ubifs_nnode *)cnode; @@ -2274,5 +2274,3 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, } return 0; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index cddd6bd..4fa7073 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -30,11 +30,7 @@ #include <linux/random.h> #include "ubifs.h" -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_populate_lsave(struct ubifs_info *c); -#else -#define dbg_populate_lsave(c) 0 -#endif /** * first_dirty_cnode - find first dirty cnode. @@ -324,11 +320,10 @@ static int layout_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space"); - dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " - "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " + "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); return err; } @@ -421,7 +416,7 @@ static int write_cnodes(struct ubifs_info *c) alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); err = ubifs_leb_write(c, lnum, buf + from, from, - alen, UBI_SHORTTERM); + alen); if (err) return err; } @@ -479,8 +474,7 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; dbg_chk_lpt_sz(c, 2, c->leb_size - offs); @@ -506,8 +500,7 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; dbg_chk_lpt_sz(c, 2, c->leb_size - offs); @@ -531,7 +524,7 @@ static int write_cnodes(struct ubifs_info *c) wlen = offs - from; alen = ALIGN(wlen, c->min_io_size); memset(buf + offs, 0xff, alen - wlen); - err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, buf + from, from, alen); if (err) return err; @@ -552,11 +545,10 @@ static int write_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space mismatch"); - dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " - "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); return err; } @@ -1497,7 +1489,9 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only) kfree(c->lpt_nod_buf); } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ /** * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. 
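dbg_is_all_ff(), named above, reports whether a buffer still carries the erased-flash pattern, i.e. holds nothing but 0xFF bytes. A plausible standalone version of such a check (an illustration under that assumption, not the kernel's implementation):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Illustration only: true if buf[0..len) is entirely 0xFF (erased flash) */
static bool is_all_ff(const unsigned char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (buf[i] != 0xff)
			return false;
	return true;
}

int main(void)
{
	unsigned char leb[64];

	memset(leb, 0xff, sizeof(leb));
	printf("%d\n", is_all_ff(leb, sizeof(leb)));	/* 1: still erased */
	leb[10] = 0;
	printf("%d\n", is_all_ff(leb, sizeof(leb)));	/* 0: written to */
	return 0;
}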
@@ -1735,7 +1729,7 @@ int dbg_check_ltab(struct ubifs_info *c) for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { err = dbg_check_ltab_lnum(c, lnum); if (err) { - dbg_err("failed at LEB %d", lnum); + ubifs_err("failed at LEB %d", lnum); return err; } } @@ -1767,10 +1761,10 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) free += c->leb_size; } if (free < c->lpt_sz) { - dbg_err("LPT space error: free %lld lpt_sz %lld", - free, c->lpt_sz); - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); return -EINVAL; } @@ -1807,13 +1801,13 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) d->chk_lpt_lebs = 0; d->chk_lpt_wastage = 0; if (c->dirty_pn_cnt > c->pnode_cnt) { - dbg_err("dirty pnodes %d exceed max %d", - c->dirty_pn_cnt, c->pnode_cnt); + ubifs_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); err = -EINVAL; } if (c->dirty_nn_cnt > c->nnode_cnt) { - dbg_err("dirty nnodes %d exceed max %d", - c->dirty_nn_cnt, c->nnode_cnt); + ubifs_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); err = -EINVAL; } return err; @@ -1830,23 +1824,23 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) chk_lpt_sz *= d->chk_lpt_lebs; chk_lpt_sz += len - c->nhead_offs; if (d->chk_lpt_sz != chk_lpt_sz) { - dbg_err("LPT wrote %lld but space used was %lld", - d->chk_lpt_sz, chk_lpt_sz); + ubifs_err("LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); err = -EINVAL; } if (d->chk_lpt_sz > c->lpt_sz) { - dbg_err("LPT wrote %lld but lpt_sz is %lld", - d->chk_lpt_sz, c->lpt_sz); + ubifs_err("LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); err = -EINVAL; } if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { - dbg_err("LPT layout size %lld but wrote %lld", - d->chk_lpt_sz, d->chk_lpt_sz2); + ubifs_err("LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); err = -EINVAL; } if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { - dbg_err("LPT new nhead offs: expected %d was %d", - d->new_nhead_offs, len); + ubifs_err("LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); err = -EINVAL; } lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; @@ -1855,13 +1849,13 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) if (c->big_lpt) lpt_sz += c->lsave_sz; if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { - dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", - d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); err = -EINVAL; } if (err) { - dbg_dump_lpt_info(c); - dbg_dump_lpt_lebs(c); + ubifs_dump_lpt_info(c); + ubifs_dump_lpt_lebs(c); dump_stack(); } d->chk_lpt_sz2 = d->chk_lpt_sz; @@ -1880,7 +1874,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) } /** - * dbg_dump_lpt_leb - dump an LPT LEB. + * ubifs_dump_lpt_leb - dump an LPT LEB. * @c: UBIFS file-system description object * @lnum: LEB number to dump * @@ -1986,13 +1980,13 @@ out: } /** - * dbg_dump_lpt_lebs - dump LPT lebs. + * ubifs_dump_lpt_lebs - dump LPT lebs. * @c: UBIFS file-system description object * * This function dumps all LPT LEBs. The caller has to make sure the LPT is * locked. 
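The dbg_chk_lpt_sz() checks above enforce that the space the LPT commit actually consumed, minus padding waste, never exceeds the worst case of all pnodes, all nnodes, the ltab and (on big-LPT file-systems) the lsave area. A toy computation with invented numbers, just to make the inequality concrete (none of these values come from a real file-system):

#include <stdio.h>

int main(void)
{
	/* hypothetical counts and sizes */
	long long pnode_cnt = 100, pnode_sz = 64;
	long long nnode_cnt = 40, nnode_sz = 48;
	long long ltab_sz = 256, lsave_sz = 0;	/* assume !big_lpt */
	long long chk_lpt_sz = 8500, chk_lpt_wastage = 300;
	long long lpt_sz;

	lpt_sz = pnode_cnt * pnode_sz + nnode_cnt * nnode_sz +
		 ltab_sz + lsave_sz;			/* worst case: 8576 */
	printf("used %lld, limit %lld -> %s\n",
	       chk_lpt_sz - chk_lpt_wastage, lpt_sz,
	       chk_lpt_sz - chk_lpt_wastage > lpt_sz ? "error" : "ok");
	return 0;
}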
*/ -void dbg_dump_lpt_lebs(const struct ubifs_info *c) +void ubifs_dump_lpt_lebs(const struct ubifs_info *c) { int i; @@ -2046,5 +2040,3 @@ static int dbg_populate_lsave(struct ubifs_info *c) return 1; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 278c238..ab83ace 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -241,7 +241,7 @@ static int validate_master(const struct ubifs_info *c) out: ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); - dbg_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -317,7 +317,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_err("bad leb_cnt on master node"); - dbg_dump_node(c, c->mst_node); + ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -379,7 +379,7 @@ int ubifs_write_master(struct ubifs_info *c) c->mst_offs = offs; c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); - err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); if (err) return err; @@ -390,7 +390,7 @@ int ubifs_write_master(struct ubifs_info *c) if (err) return err; } - err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + err = ubifs_write_node(c, c->mst_node, len, lnum, offs); return err; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index c542c73..b02734d 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -52,11 +52,7 @@ * than the maximum number of orphans allowed. */ -#ifdef CONFIG_UBIFS_FS_DEBUG static int dbg_check_orphans(struct ubifs_info *c); -#else -#define dbg_check_orphans(c) 0 -#endif /** * ubifs_add_orphan - add an orphan. @@ -92,7 +88,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = &(*p)->rb_right; else { - dbg_err("orphaned twice"); + ubifs_err("orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); return 0; @@ -158,8 +154,8 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) } } spin_unlock(&c->orphan_lock); - dbg_err("missing orphan ino %lu", (unsigned long)inum); - dbg_dump_stack(); + ubifs_err("missing orphan ino %lu", (unsigned long)inum); + dump_stack(); } /** @@ -248,8 +244,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) ubifs_assert(c->ohead_offs == 0); ubifs_prepare_node(c, c->orph_buf, len, 1); len = ALIGN(len, c->min_io_size); - err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len); } else { if (c->ohead_offs == 0) { /* Ensure LEB has been unmapped */ @@ -258,7 +253,7 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) return err; } err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, - c->ohead_offs, UBI_SHORTTERM); + c->ohead_offs); } return err; } @@ -569,7 +564,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type != UBIFS_ORPH_NODE) { ubifs_err("invalid node type %d in orphan area at " "%d:%d", snod->type, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); return -EINVAL; } @@ -597,7 +592,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_err("out of order commit number %llu in " "orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); return -EINVAL; } dbg_rcvry("out of date 
LEB %d", sleb->lnum); @@ -725,7 +720,9 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG +/* + * Everything below is related to debugging. + */ struct check_orphan { struct rb_node rb; @@ -968,5 +965,3 @@ out: kfree(ci.node); return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 2a935b3..c30d976b 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -213,10 +213,10 @@ static int write_rcvrd_mst_node(struct ubifs_info *c, mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); - err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, mst, sz); if (err) goto out; - err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum + 1, mst, sz); if (err) goto out; out: @@ -362,12 +362,12 @@ out_err: out_free: ubifs_err("failed to recover master node"); if (mst1) { - dbg_err("dumping first master node"); - dbg_dump_node(c, mst1); + ubifs_err("dumping first master node"); + ubifs_dump_node(c, mst1); } if (mst2) { - dbg_err("dumping second master node"); - dbg_dump_node(c, mst2); + ubifs_err("dumping second master node"); + ubifs_dump_node(c, mst2); } vfree(buf2); vfree(buf1); @@ -555,8 +555,7 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_pad(c, buf, pad_len); } } - err = ubifs_leb_change(c, lnum, sleb->buf, len, - UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sleb->buf, len); if (err) return err; } @@ -683,7 +682,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, ret, lnum, offs); break; } else { - dbg_err("unexpected return value %d", ret); + ubifs_err("unexpected return value %d", ret); err = -EINVAL; goto error; } @@ -789,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, corrupted_rescan: /* Re-scan the corrupted data with verbose messages */ - dbg_err("corruptio %d", ret); + ubifs_err("corruptio %d", ret); ubifs_scan_a_node(c, buf, len, lnum, offs, 1); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); @@ -827,17 +826,17 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, goto out_free; ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); if (ret != SCANNED_A_NODE) { - dbg_err("Not a valid node"); + ubifs_err("Not a valid node"); goto out_err; } if (cs_node->ch.node_type != UBIFS_CS_NODE) { - dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + ubifs_err("Node a CS node, type is %d", cs_node->ch.node_type); goto out_err; } if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { - dbg_err("CS node cmt_no %llu != current cmt_no %llu", - (unsigned long long)le64_to_cpu(cs_node->cmt_no), - c->cmt_no); + ubifs_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); goto out_err; } *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); @@ -941,7 +940,7 @@ static int recover_head(struct ubifs_info *c, int lnum, int offs, void *sbuf) err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1); if (err) return err; - return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, sbuf, offs); } return 0; @@ -1071,7 +1070,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c, } /* Write back the LEB atomically */ - err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, sbuf, len); if (err) return err; @@ -1138,9 
+1137,9 @@ static int grab_empty_leb(struct ubifs_info *c) */ lnum = ubifs_find_free_leb_for_idx(c); if (lnum < 0) { - dbg_err("could not find an empty LEB"); - dbg_dump_lprops(c); - dbg_dump_budg(c, &c->bi); + ubifs_err("could not find an empty LEB"); + ubifs_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); return lnum; } @@ -1218,7 +1217,7 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) } mutex_unlock(&wbuf->io_mutex); if (err < 0) { - dbg_err("GC failed, error %d", err); + ubifs_err("GC failed, error %d", err); if (err == -EAGAIN) err = -EINVAL; return err; @@ -1472,7 +1471,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) len -= 1; len = ALIGN(len + 1, c->min_io_size); /* Atomically write the fixed LEB back again */ - err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); + err = ubifs_leb_change(c, lnum, c->sbuf, len); if (err) goto out; dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b007637..3a2da7e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -154,8 +154,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) /* Make sure the journal head points to the latest bud */ err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, - b->bud->lnum, c->leb_size - b->free, - UBI_SHORTTERM); + b->bud->lnum, c->leb_size - b->free); out: ubifs_release_lprops(c); @@ -686,7 +685,7 @@ out: out_dump: ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; } @@ -861,16 +860,16 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) * numbers. */ if (snod->type != UBIFS_CS_NODE) { - dbg_err("first log node at LEB %d:%d is not CS node", - lnum, offs); + ubifs_err("first log node at LEB %d:%d is not CS node", + lnum, offs); goto out_dump; } if (le64_to_cpu(node->cmt_no) != c->cmt_no) { - dbg_err("first CS node at LEB %d:%d has wrong " - "commit number %llu expected %llu", - lnum, offs, - (unsigned long long)le64_to_cpu(node->cmt_no), - c->cmt_no); + ubifs_err("first CS node at LEB %d:%d has wrong " + "commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); goto out_dump; } @@ -892,7 +891,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) /* Make sure the first node sits at offset zero of the LEB */ if (snod->offs != 0) { - dbg_err("first node is not at zero offset"); + ubifs_err("first node is not at zero offset"); goto out_dump; } @@ -905,8 +904,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) } if (snod->sqnum < c->cs_sqnum) { - dbg_err("bad sqnum %llu, commit sqnum %llu", - snod->sqnum, c->cs_sqnum); + ubifs_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); goto out_dump; } @@ -958,7 +957,7 @@ out: out_dump: ubifs_err("log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); - dbg_dump_node(c, snod->node); + ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 771f7fb..ef3d1ba 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -130,7 +130,6 @@ static int create_default_filesystem(struct ubifs_info *c) * orphan node. 
*/ orph_lebs = UBIFS_MIN_ORPH_LEBS; -#ifdef CONFIG_UBIFS_FS_DEBUG if (c->leb_cnt - min_leb_cnt > 1) /* * For debugging purposes it is better to have at least 2 @@ -138,7 +137,6 @@ static int create_default_filesystem(struct ubifs_info *c) * consolidations and would be stressed more. */ orph_lebs += 1; -#endif main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; main_lebs -= orph_lebs; @@ -196,7 +194,7 @@ static int create_default_filesystem(struct ubifs_info *c) sup->rp_size = cpu_to_le64(tmp64); sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); - err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0); kfree(sup); if (err) return err; @@ -252,14 +250,13 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0); if (err) { kfree(mst); return err; } - err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, + 0); kfree(mst); if (err) return err; @@ -282,8 +279,7 @@ static int create_default_filesystem(struct ubifs_info *c) key_write_idx(c, &key, &br->key); br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); - err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, - UBI_UNKNOWN); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0); kfree(idx); if (err) return err; @@ -315,8 +311,7 @@ static int create_default_filesystem(struct ubifs_info *c) ino->flags = cpu_to_le32(UBIFS_COMPR_FL); err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, - main_first + DEFAULT_DATA_LEB, 0, - UBI_UNKNOWN); + main_first + DEFAULT_DATA_LEB, 0); kfree(ino); if (err) return err; @@ -335,8 +330,7 @@ static int create_default_filesystem(struct ubifs_info *c) return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; - err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, - 0, UBI_UNKNOWN); + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); kfree(cs); ubifs_msg("default file-system created"); @@ -475,7 +469,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) failed: ubifs_err("bad superblock, error %d", err); - dbg_dump_node(c, sup); + ubifs_dump_node(c, sup); return -EINVAL; } @@ -518,7 +512,7 @@ int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); - return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len); } /** @@ -691,7 +685,7 @@ static int fixup_leb(struct ubifs_info *c, int lnum, int len) if (err) return err; - return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN); + return ubifs_leb_change(c, lnum, c->sbuf, len); } /** diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 37383e8..7c40e60 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -101,7 +101,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (!quiet) { ubifs_err("bad pad node at LEB %d:%d", lnum, offs); - dbg_dump_node(c, pad); + ubifs_dump_node(c, pad); } return SCANNED_A_BAD_PAD_NODE; } @@ -109,8 +109,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, /* Make the node 
pad to an 8-byte boundary */ if ((node_len + pad_len) & 7) { if (!quiet) - dbg_err("bad padding length %d - %d", - offs, offs + node_len + pad_len); + ubifs_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); return SCANNED_A_BAD_PAD_NODE; } @@ -245,7 +245,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, len = c->leb_size - offs; if (len > 8192) len = 8192; - dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); } @@ -300,16 +300,16 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, switch (ret) { case SCANNED_GARBAGE: - dbg_err("garbage"); + ubifs_err("garbage"); goto corrupted; case SCANNED_A_NODE: break; case SCANNED_A_CORRUPT_NODE: case SCANNED_A_BAD_PAD_NODE: - dbg_err("bad node"); + ubifs_err("bad node"); goto corrupted; default: - dbg_err("unknown"); + ubifs_err("unknown"); err = -EINVAL; goto error; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 76e4e05..5862dd9 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -246,8 +246,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) out_invalid: ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); - dbg_dump_node(c, ino); - dbg_dump_inode(c, inode); + ubifs_dump_node(c, ino); + ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); @@ -378,7 +378,7 @@ out: smp_wmb(); } done: - end_writeback(inode); + clear_inode(inode); } static void ubifs_dirty_inode(struct inode *inode, int flags) @@ -668,8 +668,8 @@ static int init_constants_sb(struct ubifs_info *c) tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { - dbg_err("too small LEB size %d, at least %d needed", - c->leb_size, tmp); + ubifs_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); return -EINVAL; } @@ -683,8 +683,8 @@ static int init_constants_sb(struct ubifs_info *c) tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { - dbg_err("too small log %d LEBs, required min. %d LEBs", - c->log_lebs, tmp); + ubifs_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); return -EINVAL; } @@ -813,13 +813,10 @@ static int alloc_wbufs(struct ubifs_info *c) c->jheads[i].grouped = 1; } - c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; /* - * Garbage Collector head likely contains long-term data and - * does not need to be synchronized by timer. Also GC head nodes are - * not grouped. + * Garbage Collector head does not need to be synchronized by timer. + * Also GC head nodes are not grouped.
*/ - c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; c->jheads[GCHD].wbuf.no_timer = 1; c->jheads[GCHD].grouped = 0; @@ -863,7 +860,7 @@ static void free_orphans(struct ubifs_info *c) orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); - dbg_err("orphan list not empty at unmount"); + ubifs_err("orphan list not empty at unmount"); } vfree(c->orph_buf); @@ -1147,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c) ubifs_assert(c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { ubifs_err("insufficient free space to mount in R/W mode"); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); return -ENOSPC; } return 0; @@ -1301,7 +1298,7 @@ static int mount_ubifs(struct ubifs_info *c) if (!c->ro_mount && c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - goto out_master; + goto out_lpt; } if (!c->ro_mount) { @@ -2126,8 +2123,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - dbg_err("cannot open \"%s\", error %d", - name, (int)PTR_ERR(ubi)); + ubifs_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 16ad84d..349f31a 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -339,8 +339,8 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, dent); if (err) { - dbg_dump_stack(); - dbg_dump_node(c, dent); + dump_stack(); + ubifs_dump_node(c, dent); return err; } @@ -372,8 +372,8 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_validate_entry(c, node); if (err) { - dbg_dump_stack(); - dbg_dump_node(c, node); + dump_stack(); + ubifs_dump_node(c, node); return err; } @@ -1733,8 +1733,8 @@ out_err: err = -EINVAL; out: ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); - dbg_dump_node(c, buf); - dbg_dump_stack(); + ubifs_dump_node(c, buf); + dump_stack(); return err; } @@ -1775,7 +1775,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) if (err && err != -EBADMSG) { ubifs_err("failed to read from LEB %d:%d, error %d", lnum, offs, err); - dbg_dump_stack(); + dump_stack(); dbg_tnck(&bu->key, "key "); return err; } @@ -2361,7 +2361,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, * by passing 'ubifs_tnc_remove_nm()' the same key but * an unmatchable name. */ - struct qstr noname = { .len = 0, .name = "" }; + struct qstr noname = { .name = "" }; err = dbg_check_tnc(c, 0); mutex_unlock(&c->tnc_mutex); @@ -2403,7 +2403,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) err = ubifs_add_dirt(c, zbr->lnum, zbr->len); if (err) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); return err; } @@ -2649,7 +2649,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, err = ubifs_add_dirt(c, znode->zbranch[i].lnum, znode->zbranch[i].len); if (err) { - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); goto out_unlock; } dbg_tnck(key, "removing key "); @@ -3275,8 +3275,6 @@ out_unlock: return err; } -#ifdef CONFIG_UBIFS_FS_DEBUG - /** * dbg_check_inode_size - check if inode size is correct. 
* @c: UBIFS file-system description object @@ -3335,13 +3333,11 @@ out_dump: (unsigned long)inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); - dbg_dump_inode(c, inode); - dbg_dump_stack(); + ubifs_dump_inode(c, inode); + dump_stack(); return -EINVAL; out_unlock: mutex_unlock(&c->tnc_mutex); return err; } - -#endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 4c15f07..523bbad 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -54,18 +54,16 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { ubifs_err("bad ref in znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zbr->znode) - dbg_dump_znode(c, zbr->znode); + ubifs_dump_znode(c, zbr->znode); } } ubifs_prepare_node(c, idx, len, 0); -#ifdef CONFIG_UBIFS_FS_DEBUG znode->lnum = lnum; znode->offs = offs; znode->len = len; -#endif err = insert_old_idx_znode(c, znode); @@ -322,8 +320,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p) 0, 0, 0); if (err) return err; - err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len, - UBI_SHORTTERM); + err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len); if (err) return err; dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written); @@ -388,8 +385,8 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) * option which forces in-the-gaps is enabled. */ ubifs_warn("out of space"); - dbg_dump_budg(c, &c->bi); - dbg_dump_lprops(c); + ubifs_dump_budg(c, &c->bi); + ubifs_dump_lprops(c); } /* Try to commit anyway */ err = 0; @@ -456,11 +453,9 @@ static int layout_in_empty_space(struct ubifs_info *c) offs = buf_offs + used; -#ifdef CONFIG_UBIFS_FS_DEBUG znode->lnum = lnum; znode->offs = offs; znode->len = len; -#endif /* Update the parent */ zp = znode->parent; @@ -536,10 +531,8 @@ static int layout_in_empty_space(struct ubifs_info *c) break; } -#ifdef CONFIG_UBIFS_FS_DEBUG c->dbg->new_ihead_lnum = lnum; c->dbg->new_ihead_offs = buf_offs; -#endif return 0; } @@ -864,9 +857,9 @@ static int write_index(struct ubifs_info *c) br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { ubifs_err("bad ref in znode"); - dbg_dump_znode(c, znode); + ubifs_dump_znode(c, znode); if (zbr->znode) - dbg_dump_znode(c, zbr->znode); + ubifs_dump_znode(c, zbr->znode); } } len = ubifs_idx_node_sz(c, znode->child_cnt); @@ -881,13 +874,11 @@ static int write_index(struct ubifs_info *c) } offs = buf_offs + used; -#ifdef CONFIG_UBIFS_FS_DEBUG if (lnum != znode->lnum || offs != znode->offs || len != znode->len) { ubifs_err("inconsistent znode posn"); return -EINVAL; } -#endif /* Grab some stuff from znode while we still can */ cnext = znode->cnext; @@ -959,8 +950,7 @@ static int write_index(struct ubifs_info *c) } /* The buffer is full or there are no more znodes to do */ - err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen, - UBI_SHORTTERM); + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen); if (err) return err; buf_offs += blen; @@ -982,13 +972,11 @@ static int write_index(struct ubifs_info *c) break; } -#ifdef CONFIG_UBIFS_FS_DEBUG if (lnum != c->dbg->new_ihead_lnum || buf_offs != c->dbg->new_ihead_offs) { ubifs_err("inconsistent ihead"); return -EINVAL; } -#endif c->ihead_lnum = lnum; c->ihead_offs = buf_offs; diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index dc28fe6..d38ac7f 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -293,10 +293,10 @@ static int 
read_znode(struct ubifs_info *c, int lnum, int offs, int len, lnum, offs, znode->level, znode->child_cnt); if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { - dbg_err("current fanout %d, branch count %d", - c->fanout, znode->child_cnt); - dbg_err("max levels %d, znode level %d", - UBIFS_MAX_LEVELS, znode->level); + ubifs_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + ubifs_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); err = 1; goto out_dump; } @@ -316,7 +316,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, if (zbr->lnum < c->main_first || zbr->lnum >= c->leb_cnt || zbr->offs < 0 || zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { - dbg_err("bad branch %d", i); + ubifs_err("bad branch %d", i); err = 2; goto out_dump; } @@ -340,19 +340,19 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, type = key_type(c, &zbr->key); if (c->ranges[type].max_len == 0) { if (zbr->len != c->ranges[type].len) { - dbg_err("bad target node (type %d) length (%d)", - type, zbr->len); - dbg_err("have to be %d", c->ranges[type].len); + ubifs_err("bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err("have to be %d", c->ranges[type].len); err = 4; goto out_dump; } } else if (zbr->len < c->ranges[type].min_len || zbr->len > c->ranges[type].max_len) { - dbg_err("bad target node (type %d) length (%d)", - type, zbr->len); - dbg_err("have to be in range of %d-%d", - c->ranges[type].min_len, - c->ranges[type].max_len); + ubifs_err("bad target node (type %d) length (%d)", + type, zbr->len); + ubifs_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); err = 5; goto out_dump; } @@ -370,13 +370,13 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, cmp = keys_cmp(c, key1, key2); if (cmp > 0) { - dbg_err("bad key order (keys %d and %d)", i, i + 1); + ubifs_err("bad key order (keys %d and %d)", i, i + 1); err = 6; goto out_dump; } else if (cmp == 0 && !is_hash_key(c, key1)) { /* These can only be keys with colliding hash */ - dbg_err("keys %d and %d are not hashed but equivalent", - i, i + 1); + ubifs_err("keys %d and %d are not hashed but equivalent", + i, i + 1); err = 7; goto out_dump; } @@ -387,7 +387,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, out_dump: ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); - dbg_dump_node(c, idx); + ubifs_dump_node(c, idx); kfree(idx); return -EINVAL; } @@ -486,7 +486,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, zbr->lnum, zbr->offs); dbg_tnck(key, "looked for key "); dbg_tnck(&key1, "but found node's key "); - dbg_dump_node(c, node); + ubifs_dump_node(c, node); return -EINVAL; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 93d59ac..1e5a086 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -650,8 +650,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) - * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, - * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep * up by 'mutex_lock_nested()). 
* @sync_callback: write-buffer synchronization callback @@ -685,7 +683,6 @@ struct ubifs_wbuf { int avail; int used; int size; - int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; @@ -762,6 +759,9 @@ struct ubifs_zbranch { * @offs: offset of the corresponding indexing node * @len: length of the corresponding indexing node * @zbranch: array of znode branches (@c->fanout elements) + * + * Note! The @lnum, @offs, and @len fields are not really needed - we have them + * only for internal consistency check. They could be removed to save some RAM. */ struct ubifs_znode { struct ubifs_znode *parent; @@ -772,9 +772,9 @@ struct ubifs_znode { int child_cnt; int iip; int alt; -#ifdef CONFIG_UBIFS_FS_DEBUG - int lnum, offs, len; -#endif + int lnum; + int offs; + int len; struct ubifs_zbranch zbranch[]; }; @@ -1444,9 +1444,7 @@ struct ubifs_info { struct rb_root size_tree; struct ubifs_mount_opts mount_opts; -#ifdef CONFIG_UBIFS_FS_DEBUG struct ubifs_debug_info *dbg; -#endif }; extern struct list_head ubifs_infos; @@ -1468,22 +1466,20 @@ void ubifs_ro_mode(struct ubifs_info *c, int err); int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, int len, int even_ebadmsg); int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, - int len, int dtype); -int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len, - int dtype); + int len); +int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len); int ubifs_leb_unmap(struct ubifs_info *c, int lnum); -int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype); +int ubifs_leb_map(struct ubifs_info *c, int lnum); int ubifs_is_mapped(const struct ubifs_info *c, int lnum); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); -int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, - int dtype); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs); int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs); int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, int lnum, int offs); int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, - int offs, int dtype); + int offs); int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, int offs, int quiet, int must_chk_crc); void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 85b2722..0f7139b 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -298,7 +298,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, { struct inode *inode, *host = dentry->d_inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = { .name = name, .len = strlen(name) }; + struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; int err, type; @@ -361,7 +361,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, { struct inode *inode, *host = dentry->d_inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = { .name = name, .len = strlen(name) }; + struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_inode *ui; struct ubifs_dent_node *xent; union ubifs_key key; @@ -399,8 +399,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (buf) { /* If @buf is 
%NULL we are supposed to return the length */ if (ui->data_len > size) { - dbg_err("buffer size %zd, xattr len %d", - size, ui->data_len); + ubifs_err("buffer size %zd, xattr len %d", + size, ui->data_len); err = -ERANGE; goto out_iput; } @@ -524,7 +524,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode, *host = dentry->d_inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = { .name = name, .len = strlen(name) }; + struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; int err; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7d75280..873e1ba 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -80,7 +80,7 @@ void udf_evict_inode(struct inode *inode) } else truncate_inode_pages(&inode->i_data, 0); invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 38de8f2..a165c66 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1193,7 +1193,7 @@ static struct dentry *udf_get_parent(struct dentry *child) { struct kernel_lb_addr tloc; struct inode *inode = NULL; - struct qstr dotdot = {.name = "..", .len = 2}; + struct qstr dotdot = QSTR_INIT("..", 2); struct fileIdentDesc cfi; struct udf_fileident_bh fibh; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cdd395..dd7c89d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -895,7 +895,7 @@ void ufs_evict_inode(struct inode * inode) } invalidate_inode_buffers(inode); - end_writeback(inode); + clear_inode(inode); if (want_delete) { lock_ufs(inode->i_sb); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index ac8e279..302f340 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -146,10 +146,7 @@ static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, static struct dentry *ufs_get_parent(struct dentry *child) { - struct qstr dot_dot = { - .name = "..", - .len = 2, - }; + struct qstr dot_dot = QSTR_INIT("..", 2); ino_t ino; ino = ufs_inode_by_name(child->d_inode, &dot_dot); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 0a99779..d2bf974 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-y += xfs_aops.o \ xfs_discard.o \ xfs_error.o \ xfs_export.o \ + xfs_extent_busy.o \ xfs_file.o \ xfs_filestream.o \ xfs_fsops.o \ @@ -49,7 +50,6 @@ xfs-y += xfs_aops.o \ xfs_sync.o \ xfs_xattr.o \ xfs_rename.o \ - xfs_rw.o \ xfs_utils.o \ xfs_vnodeops.o \ kmem.o \ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 4805f00..44d65c1 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,24 +175,6 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that - * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_alloc_busy_insert() for details. - */ -struct xfs_busy_extent { - struct rb_node rb_node; /* ag by-bno indexed search tree */ - struct list_head list; /* transaction busy extent list */ - xfs_agnumber_t agno; - xfs_agblock_t bno; - xfs_extlen_t length; - unsigned int flags; -#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. 
*/ -#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */ -}; - -/* * Per-ag incore structure, copies of information in agf and agi, * to improve the performance of allocation group selection. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 0f0df27..229641f 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +31,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -47,8 +47,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); -STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, - xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); /* * Lookup the record equal to [bno, len] in the btree given by cur. @@ -152,7 +150,7 @@ xfs_alloc_compute_aligned( xfs_extlen_t len; /* Trim busy sections out of found extent */ - xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); @@ -536,7 +534,7 @@ xfs_alloc_ag_vextent( if (error) return error; - ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->agno, args->agbno, args->len)); } @@ -603,7 +601,7 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. */ - xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -1391,7 +1389,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; if (fbno != NULLAGBLOCK) { - xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, args->userdata); if (args->userdata) { @@ -2496,579 +2494,8 @@ xfs_free_extent( error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); if (!error) - xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0); + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); error0: xfs_perag_put(args.pag); return error; } - -void -xfs_alloc_busy_insert( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - unsigned int flags) -{ - struct xfs_busy_extent *new; - struct xfs_busy_extent *busyp; - struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent = NULL; - - new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); - if (!new) { - /* - * No Memory! Since it is now not possible to track the free - * block, make this a synchronous transaction to insure that - * the block is not reused before this transaction commits. 
- */ - trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); - xfs_trans_set_sync(tp); - return; - } - - new->agno = agno; - new->bno = bno; - new->length = len; - INIT_LIST_HEAD(&new->list); - new->flags = flags; - - /* trace before insert to be able to see failed inserts */ - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); - - pag = xfs_perag_get(tp->t_mountp, new->agno); - spin_lock(&pag->pagb_lock); - rbp = &pag->pagb_tree.rb_node; - while (*rbp) { - parent = *rbp; - busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); - - if (new->bno < busyp->bno) { - rbp = &(*rbp)->rb_left; - ASSERT(new->bno + new->length <= busyp->bno); - } else if (new->bno > busyp->bno) { - rbp = &(*rbp)->rb_right; - ASSERT(bno >= busyp->bno + busyp->length); - } else { - ASSERT(0); - } - } - - rb_link_node(&new->rb_node, parent, rbp); - rb_insert_color(&new->rb_node, &pag->pagb_tree); - - list_add(&new->list, &tp->t_busy); - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * Search for a busy extent within the range of the extent we are about to - * allocate. You need to be holding the busy extent tree lock when calling - * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy - * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact - * match. This is done so that a non-zero return indicates an overlap that - * will require a synchronous transaction, but it can still be - * used to distinguish between a partial or exact match. - */ -int -xfs_alloc_busy_search( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - struct xfs_busy_extent *busyp; - int match = 0; - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); - - rbp = pag->pagb_tree.rb_node; - - /* find closest start bno overlap */ - while (rbp) { - busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); - if (bno < busyp->bno) { - /* may overlap, but exact start block is lower */ - if (bno + len > busyp->bno) - match = -1; - rbp = rbp->rb_left; - } else if (bno > busyp->bno) { - /* may overlap, but exact start block is higher */ - if (bno < busyp->bno + busyp->length) - match = -1; - rbp = rbp->rb_right; - } else { - /* bno matches busyp, length determines exact match */ - match = (busyp->length == len) ? 1 : -1; - break; - } - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - return match; -} - -/* - * The found free extent [fbno, fend] overlaps part or all of the given busy - * extent. If the overlap covers the beginning, the end, or all of the busy - * extent, the overlapping portion can be made unbusy and used for the - * allocation. We can't split a busy extent because we can't modify a - * transaction/CIL context busy list, but we can update an entries block - * number or length. - * - * Returns true if the extent can safely be reused, or false if the search - * needs to be restarted. - */ -STATIC bool -xfs_alloc_busy_update_extent( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - /* - * This extent is currently being discarded. Give the thread - * performing the discard a chance to mark the extent unbusy - * and retry. 
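xfs_alloc_busy_search(), which this series turns into xfs_extent_busy_search(), returns 0 when no busy extent overlaps the candidate range, -1 for a partial overlap and 1 for an exact match, so any non-zero result signals an overlap while still telling the caller which kind it is. The classification itself is plain interval arithmetic; a self-contained sketch against a single busy extent (an illustration with simplified types, not the XFS rbtree walk):

#include <stdio.h>

/*
 * Illustration only: classify how a candidate extent [bno, bno+len) relates
 * to one busy extent [bbno, bbno+blen): 0 = disjoint, 1 = exact match,
 * -1 = partial overlap.
 */
static int busy_overlap(unsigned bno, unsigned len, unsigned bbno, unsigned blen)
{
	if (bno + len <= bbno || bno >= bbno + blen)
		return 0;
	if (bno == bbno && len == blen)
		return 1;
	return -1;
}

int main(void)
{
	printf("%d\n", busy_overlap(10, 5, 10, 5));	/*  1: exact */
	printf("%d\n", busy_overlap(8, 4, 10, 5));	/* -1: partial */
	printf("%d\n", busy_overlap(0, 5, 10, 5));	/*  0: disjoint */
	return 0;
}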
- */ - if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) { - spin_unlock(&pag->pagb_lock); - delay(1); - spin_lock(&pag->pagb_lock); - return false; - } - - /* - * If there is a busy extent overlapping a user allocation, we have - * no choice but to force the log and retry the search. - * - * Fortunately this does not happen during normal operation, but - * only if the filesystem is very low on space and has to dip into - * the AGFL for normal allocations. - */ - if (userdata) - goto out_force_log; - - if (bbno < fbno && bend > fend) { - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - */ - - /* - * We would have to split the busy extent to be able to track - * it correct, which we cannot do because we would have to - * modify the list of busy extents attached to the transaction - * or CIL context, which is immutable. - * - * Force out the log to clear the busy extent and retry the - * search. - */ - goto out_force_log; - } else if (bbno >= fbno && bend <= fend) { - /* - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - */ - - /* - * The busy extent is fully covered by the extent we are - * allocating, and can simply be removed from the rbtree. - * However we cannot remove it from the immutable list - * tracking busy extents in the transaction or CIL context, - * so set the length to zero to mark it invalid. - * - * We also need to restart the busy extent search from the - * tree root, because erasing the node can rearrange the - * tree topology. - */ - rb_erase(&busyp->rb_node, &pag->pagb_tree); - busyp->length = 0; - return false; - } else if (fend < bend) { - /* - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - */ - busyp->bno = fend; - } else if (bbno < fbno) { - /* - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - */ - busyp->length = fbno - busyp->bno; - } else { - ASSERT(0); - } - - trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); - return true; - -out_force_log: - spin_unlock(&pag->pagb_lock); - xfs_log_force(mp, XFS_LOG_SYNC); - trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); - spin_lock(&pag->pagb_lock); - return false; -} - - -/* - * For a given extent [fbno, flen], make sure we can reuse it safely. 
- */ -void -xfs_alloc_busy_reuse( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agblock_t fbno, - xfs_extlen_t flen, - bool userdata) -{ - struct xfs_perag *pag; - struct rb_node *rbp; - - ASSERT(flen > 0); - - pag = xfs_perag_get(mp, agno); - spin_lock(&pag->pagb_lock); -restart: - rbp = pag->pagb_tree.rb_node; - while (rbp) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fbno + flen <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, - userdata)) - goto restart; - } - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); -} - -/* - * For a given extent [fbno, flen], search the busy extent list to find a - * subset of the extent that is not busy. If *rlen is smaller than - * args->minlen no suitable extent could be found, and the higher level - * code needs to force out the log and retry the allocation. - */ -STATIC void -xfs_alloc_busy_trim( - struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) -{ - xfs_agblock_t fbno; - xfs_extlen_t flen; - struct rb_node *rbp; - - ASSERT(len > 0); - - spin_lock(&args->pag->pagb_lock); -restart: - fbno = bno; - flen = len; - rbp = args->pag->pagb_tree.rb_node; - while (rbp && flen >= args->minlen) { - struct xfs_busy_extent *busyp = - rb_entry(rbp, struct xfs_busy_extent, rb_node); - xfs_agblock_t fend = fbno + flen; - xfs_agblock_t bbno = busyp->bno; - xfs_agblock_t bend = bbno + busyp->length; - - if (fend <= bbno) { - rbp = rbp->rb_left; - continue; - } else if (fbno >= bend) { - rbp = rbp->rb_right; - continue; - } - - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!args->userdata && - !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) { - if (!xfs_alloc_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - - if (bbno <= fbno) { - /* start overlap */ - - /* - * Case 1: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +---------+ - * fbno fend - * - * Case 2: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 3: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-------------+ - * fbno fend - * - * Case 4: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------+ - * fbno fend - * - * No unbusy region in extent, return failure. 
- */ - if (fend <= bend) - goto fail; - - /* - * Case 5: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +----------------------+ - * fbno fend - * - * Case 6: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fbno = bend; - } else if (bend >= fend) { - /* end overlap */ - - /* - * Case 7: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +------------------+ - * fbno fend - * - * Case 8: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +--------------------------+ - * fbno fend - * - * Needs to be trimmed to: - * +-------+ - * fbno fend - */ - fend = bbno; - } else { - /* middle overlap */ - - /* - * Case 9: - * bbno bend - * +BBBBBBBBBBBBBBBBB+ - * +-----------------------------------+ - * fbno fend - * - * Can be trimmed to: - * +-------+ OR +-------+ - * fbno fend fbno fend - * - * Backward allocation leads to significant - * fragmentation of directories, which degrades - * directory performance, therefore we always want to - * choose the option that produces forward allocation - * patterns. - * Preferring the lower bno extent will make the next - * request use "fend" as the start of the next - * allocation; if the segment is no longer busy at - * that point, we'll get a contiguous allocation, but - * even if it is still busy, we will get a forward - * allocation. - * We try to avoid choosing the segment at "bend", - * because that can lead to the next allocation - * taking the segment at "fbno", which would be a - * backward allocation. We only use the segment at - * "fbno" if it is much larger than the current - * requested size, because in that case there's a - * good chance subsequent allocations will be - * contiguous. - */ - if (bbno - fbno >= args->maxlen) { - /* left candidate fits perfect */ - fend = bbno; - } else if (fend - bend >= args->maxlen * 4) { - /* right candidate has enough free space */ - fbno = bend; - } else if (bbno - fbno >= args->minlen) { - /* left candidate fits minimum requirement */ - fend = bbno; - } else { - goto fail; - } - } - - flen = fend - fbno; - } - spin_unlock(&args->pag->pagb_lock); - - if (fbno != bno || flen != len) { - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, - fbno, flen); - } - *rbno = fbno; - *rlen = flen; - return; -fail: - /* - * Return a zero extent length as failure indications. All callers - * re-check if the trimmed extent satisfies the minlen requirement. - */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; -} - -static void -xfs_alloc_busy_clear_one( - struct xfs_mount *mp, - struct xfs_perag *pag, - struct xfs_busy_extent *busyp) -{ - if (busyp->length) { - trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, - busyp->length); - rb_erase(&busyp->rb_node, &pag->pagb_tree); - } - - list_del_init(&busyp->list); - kmem_free(busyp); -} - -/* - * Remove all extents on the passed in list from the busy extents tree. - * If do_discard is set skip extents that need to be discarded, and mark - * these as undergoing a discard operation instead. 
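Of the trim cases above, only the middle overlap (case 9) involves a real policy decision. A minimal userspace sketch of that branch, with invented names and plain unsigned block units, shows how strongly the code prefers the lower, forward-allocating candidate:

    #include <stdbool.h>

    /*
     * A busy range [bbno, bend) splits the free range [fbno, fend) in
     * two; pick a piece, favouring the left one so the next allocation
     * continues forward on disk.
     */
    static bool trim_middle(unsigned fbno, unsigned fend,
                            unsigned bbno, unsigned bend,
                            unsigned minlen, unsigned maxlen,
                            unsigned *rbno, unsigned *rlen)
    {
        if (bbno - fbno >= maxlen) {
            fend = bbno;            /* left piece fits perfectly */
        } else if (fend - bend >= maxlen * 4) {
            fbno = bend;            /* right piece is much larger */
        } else if (bbno - fbno >= minlen) {
            fend = bbno;            /* left piece meets minlen */
        } else {
            return false;           /* no usable piece: force the log */
        }
        *rbno = fbno;
        *rlen = fend - fbno;
        return true;
    }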
- */ -void -xfs_alloc_busy_clear( - struct xfs_mount *mp, - struct list_head *list, - bool do_discard) -{ - struct xfs_busy_extent *busyp, *n; - struct xfs_perag *pag = NULL; - xfs_agnumber_t agno = NULLAGNUMBER; - - list_for_each_entry_safe(busyp, n, list, list) { - if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); - agno = busyp->agno; - } - - if (do_discard && busyp->length && - !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD)) - busyp->flags = XFS_ALLOC_BUSY_DISCARDED; - else - xfs_alloc_busy_clear_one(mp, pag, busyp); - } - - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } -} - -/* - * Callback for list_sort to sort busy extents by the AG they reside in. - */ -int -xfs_busy_extent_ag_cmp( - void *priv, - struct list_head *a, - struct list_head *b) -{ - return container_of(a, struct xfs_busy_extent, list)->agno - - container_of(b, struct xfs_busy_extent, list)->agno; -} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 3a7e7d8..93be4a6 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -23,7 +23,6 @@ struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; -struct xfs_busy_extent; extern struct workqueue_struct *xfs_alloc_wq; @@ -139,33 +138,6 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag); -#ifdef __KERNEL__ -void -xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); - -void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list, - bool do_discard); - -int -xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t bno, xfs_extlen_t len); - -void -xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); - -int -xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); - -static inline void xfs_alloc_busy_sort(struct list_head *list) -{ - list_sort(NULL, list, xfs_busy_extent_ag_cmp); -} - -#endif /* __KERNEL__ */ - /* * Compute and fill in value of m_ag_maxlevels. 
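xfs_busy_extent_ag_cmp() above exists so that xfs_alloc_busy_sort() can group a transaction's busy list by allocation group before xfs_alloc_busy_clear() walks it; the walker then drops and retakes the per-AG lock only when the AG number changes. The same idea with qsort(3), struct layout invented for the sketch:

    #include <stdlib.h>

    struct busy_ext {
        unsigned agno;              /* allocation group */
        unsigned bno;
        unsigned len;
    };

    /* userspace analogue of xfs_busy_extent_ag_cmp() */
    static int ag_cmp(const void *a, const void *b)
    {
        const struct busy_ext *x = a;
        const struct busy_ext *y = b;

        return (int)x->agno - (int)y->agno;
    }

    /* sort so a subsequent walk takes each per-AG lock once per run */
    static void sort_by_ag(struct busy_ext *v, size_t n)
    {
        qsort(v, n, sizeof(*v), ag_cmp);
    }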
*/ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index ffb3386..f1647ca 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -32,6 +30,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -94,7 +93,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -119,8 +118,8 @@ xfs_allocbt_free_block( if (error) return error; - xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, - XFS_ALLOC_BUSY_SKIP_DISCARD); + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0dbb9e7..ae31c31 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -16,9 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -29,7 +27,6 @@ #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -623,7 +620,7 @@ xfs_map_at_offset( * or delayed allocate extent. */ STATIC int -xfs_is_delayed_page( +xfs_check_page_type( struct page *page, unsigned int type) { @@ -637,11 +634,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IO_UNWRITTEN); + acceptable += (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IO_DELALLOC); + acceptable += (type == IO_DELALLOC); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IO_OVERWRITE); + acceptable += (type == IO_OVERWRITE); else break; } while ((bh = bh->b_this_page) != head); @@ -684,7 +681,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type)) goto fail_unlock_page; /* @@ -834,7 +831,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_is_delayed_page(page, IO_DELALLOC)) + if (!xfs_check_page_type(page, IO_DELALLOC)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1146,7 +1143,14 @@ __xfs_get_blocks( if (!create && direct && offset >= i_size_read(inode)) return 0; - if (create) { + /* + * Direct I/O is usually done on preallocated files, so try getting + * a block mapping without an exclusive lock first. For buffered + * writes we already have the exclusive iolock anyway, so avoiding + * a lock roundtrip here by taking the ilock exclusive from the + * beginning is a useful micro optimization. 
+ */ + if (create && !direct) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { @@ -1168,23 +1172,45 @@ __xfs_get_blocks( (!nimaps || (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK))) { - if (direct) { + if (direct || xfs_get_extsz_hint(ip)) { + /* + * Drop the ilock in preparation for starting the block + * allocation transaction. It will be retaken + * exclusively inside xfs_iomap_write_direct for the + * actual allocation. + */ + xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); + if (error) + return -error; + new = 1; } else { + /* + * Delalloc reservations do not require a transaction, + * so we can go on without dropping the lock here. If we + * are allocating a new delalloc block, make sure that we + * set the new flag so the buffer is marked new; that way + * we know it is newly allocated if the write fails. + */ + if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + new = 1; error = xfs_iomap_write_delay(ip, offset, size, &imap); + if (error) + goto out_unlock; + + xfs_iunlock(ip, lockmode); } - if (error) - goto out_unlock; trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); } else if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } - xfs_iunlock(ip, lockmode); if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK) { @@ -1386,52 +1412,91 @@ out_destroy_ioend: return ret; } +/* + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have made it to disk yet + * as the page is still locked at this point. + */ +STATIC void +xfs_vm_kill_delalloc_range( + struct inode *inode, + loff_t start, + loff_t end) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + start_fsb = XFS_B_TO_FSB(ip->i_mount, start); + end_fsb = XFS_B_TO_FSB(ip->i_mount, end); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + STATIC void xfs_vm_write_failed( - struct address_space *mapping, - loff_t to) + struct inode *inode, + struct page *page, + loff_t pos, + unsigned len) { - struct inode *inode = mapping->host; + loff_t block_offset = pos & PAGE_MASK; + loff_t block_start; + loff_t block_end; + loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t to = from + len; + struct buffer_head *bh, *head; - if (to > inode->i_size) { - /* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have - * made it to disk yet as the page is still locked at this - * point.
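xfs_vm_kill_delalloc_range() above only ever punches whole filesystem blocks, which is why it bails out when the byte range rounds to zero blocks. A sketch of that conversion and guard, assuming a 4k block size (XFS_B_TO_FSB rounds byte offsets up to whole blocks):

    #include <stdint.h>

    #define BLOCK_SHIFT 12                  /* assumed 4k blocks */
    #define BLOCK_SIZE  (1ULL << BLOCK_SHIFT)

    /* round a byte offset up to whole blocks, like XFS_B_TO_FSB() */
    static uint64_t b_to_fsb(uint64_t bytes)
    {
        return (bytes + BLOCK_SIZE - 1) >> BLOCK_SHIFT;
    }

    /* nonzero when [start, end) still covers at least one block */
    static int punch_range(uint64_t start, uint64_t end,
                           uint64_t *start_fsb, uint64_t *end_fsb)
    {
        *start_fsb = b_to_fsb(start);
        *end_fsb = b_to_fsb(end);
        return *end_fsb > *start_fsb;       /* mirrors the end <= start bail-out */
    }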
- */ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; + ASSERT(block_offset + from == pos); - truncate_pagecache(inode, to, inode->i_size); + head = page_buffers(page); + block_start = 0; + for (bh = head; bh != head || !block_start; + bh = bh->b_this_page, block_start = block_end, + block_offset += bh->b_size) { + block_end = block_start + bh->b_size; - /* - * Check if there are any blocks that are outside of i_size - * that need to be trimmed back. - */ - start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; - end_fsb = XFS_B_TO_FSB(ip->i_mount, to); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* skip buffers before the write */ + if (block_end <= from) + continue; + + /* if the buffer is after the write, we're done */ + if (block_start >= to) + break; + + if (!buffer_delay(bh)) + continue; + + if (!buffer_new(bh) && block_offset < i_size_read(inode)) + continue; + + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); } + } +/* + * This used to call block_write_begin(), but it unlocks and releases the page + * on error, and we need that page to be able to punch stale delalloc blocks out + * on failure. Hence we copy-n-waste it here and call xfs_vm_write_failed() at + * the appropriate point. + */ STATIC int xfs_vm_write_begin( struct file *file, @@ -1442,15 +1507,40 @@ xfs_vm_write_begin( struct page **pagep, void **fsdata) { - int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; - ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS, - pagep, xfs_get_blocks); - if (unlikely(ret)) - xfs_vm_write_failed(mapping, pos + len); - return ret; + ASSERT(len <= PAGE_CACHE_SIZE); + + page = grab_cache_page_write_begin(mapping, index, + flags | AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (unlikely(status)) { + struct inode *inode = mapping->host; + + xfs_vm_write_failed(inode, page, pos, len); + unlock_page(page); + + if (pos + len > i_size_read(inode)) + truncate_pagecache(inode, pos + len, i_size_read(inode)); + + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; } +/* + * On failure, we only need to kill delalloc blocks beyond EOF because they + * will never be written. For blocks within EOF, generic_write_end() zeros them + * so they are safe to leave alone and be written with all the other valid data.
+ */ STATIC int xfs_vm_write_end( struct file *file, @@ -1463,9 +1553,19 @@ xfs_vm_write_end( { int ret; + ASSERT(len <= PAGE_CACHE_SIZE); + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) - xfs_vm_write_failed(mapping, pos + len); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + truncate_pagecache(inode, to, isize); + xfs_vm_kill_delalloc_range(inode, isize, to); + } + } return ret; } diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 65d61b9..a17ff01 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" @@ -1987,14 +1985,12 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XBF_LOCK | XBF_DONT_BLOCK, - &bp); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, blkcnt, 0, &bp); if (error) return(error); - tmp = (valuelen < XFS_BUF_SIZE(bp)) - ? valuelen : XFS_BUF_SIZE(bp); + tmp = min_t(int, valuelen, BBTOB(bp->b_length)); xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ); xfs_buf_relse(bp); dst += tmp; @@ -2097,6 +2093,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) lblkno = args->rmtblkno; valuelen = args->valuelen; while (valuelen > 0) { + int buflen; + /* * Try to remember where we decided to put the value. */ @@ -2114,15 +2112,16 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, - XBF_LOCK | XBF_DONT_BLOCK); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); if (!bp) return ENOMEM; - tmp = (valuelen < XFS_BUF_SIZE(bp)) ? 
valuelen : - XFS_BUF_SIZE(bp); + + buflen = BBTOB(bp->b_length); + tmp = min_t(int, valuelen, buflen); xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE); - if (tmp < XFS_BUF_SIZE(bp)) - xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); + if (tmp < buflen) + xfs_buf_zero(bp, tmp, buflen - tmp); + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 76d93dc..7d89d80 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -2983,7 +2982,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, map.br_blockcount); bp = xfs_trans_get_buf(*trans, dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XBF_LOCK); + dblkno, dblkcnt, 0); if (!bp) return ENOMEM; xfs_trans_binval(*trans, bp); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 85e7e32..58b815e 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -41,7 +41,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_attr_leaf.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" @@ -4527,7 +4526,7 @@ out_unreserve_blocks: xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, alen, 0, rt ? + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); return error; } @@ -5621,8 +5620,20 @@ xfs_getbmap( XFS_FSB_TO_BB(mp, map[i].br_blockcount); out[cur_ext].bmv_unused1 = 0; out[cur_ext].bmv_unused2 = 0; - ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || - (map[i].br_startblock != DELAYSTARTBLOCK)); + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ @@ -6157,3 +6168,16 @@ next_block: return error; } + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? 
\ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 89ee672..803b56d 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -211,6 +211,9 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index e2f5d59..862084a 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 1f19f03..e53e317 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6819b51..172d3cc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -35,14 +35,12 @@ #include <linux/freezer.h> #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" static kmem_zone_t *xfs_buf_zone; -STATIC int xfsbufd(void *); static struct workqueue_struct *xfslogd_workqueue; @@ -57,11 +55,7 @@ static struct workqueue_struct *xfslogd_workqueue; #endif #define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) - -#define xb_to_km(flags) \ - (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) static inline int @@ -71,11 +65,11 @@ xfs_buf_is_vmapped( /* * Return true if the buffer is vmapped. * - * The XBF_MAPPED flag is set if the buffer should be mapped, but the - * code is clever enough to know it doesn't have to map a single page, - * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1. + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. */ - return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1; + return bp->b_addr && bp->b_page_count > 1; } static inline int @@ -144,8 +138,17 @@ void xfs_buf_stale( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_STALE; - xfs_buf_delwri_dequeue(bp); + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. 
+ */ + bp->b_flags &= ~_XBF_DELWRI_Q; + atomic_set(&(bp)->b_lru_ref, 0); if (!list_empty(&bp->b_lru)) { struct xfs_buftarg *btp = bp->b_target; @@ -164,22 +167,22 @@ xfs_buf_stale( struct xfs_buf * xfs_buf_alloc( struct xfs_buftarg *target, - xfs_off_t range_base, - size_t range_length, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; - bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)); + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); if (unlikely(!bp)) return NULL; /* - * We don't want certain flags to appear in b_flags. + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. */ - flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - memset(bp, 0, sizeof(xfs_buf_t)); atomic_set(&bp->b_hold, 1); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); @@ -189,14 +192,22 @@ xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; - bp->b_file_offset = range_base; + /* - * Set buffer_length and count_desired to the same value initially. - * I/O routines should use count_desired, which will be the same in + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in * most cases but may be reset (e.g. XFS recovery). */ - bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_flags = flags; + + /* + * We do not set the block number here in the buffer because we have not + * finished initialising the buffer. We insert the buffer into the cache + * in this state, so this ensures that we are unable to do IO on a + * buffer that hasn't been fully initialised. + */ bp->b_bn = XFS_BUF_DADDR_NULL; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -219,13 +230,12 @@ _xfs_buf_get_pages( { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { - bp->b_offset = xfs_buf_poff(bp->b_file_offset); bp->b_page_count = page_count; if (page_count <= XB_PAGES) { bp->b_pages = bp->b_page_array; } else { bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, xb_to_km(flags)); + page_count, KM_NOFS); if (bp->b_pages == NULL) return -ENOMEM; } @@ -288,11 +298,11 @@ xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - size_t size = bp->b_count_desired; + size_t size; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - xfs_off_t end; + xfs_off_t start, end; int error; /* @@ -300,15 +310,15 @@ xfs_buf_allocate_memory( * the memory from the heap - there's no need for the complexity of * page arrays to keep allocation down to order 0. 
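b_length and b_io_length above are counted in 512-byte basic blocks (BBSHIFT is 9 in XFS), so the rewritten buffer code converts with BBTOB()/BTOBB() instead of carrying byte offsets around. A standalone sketch of the arithmetic, including the page-span calculation used when the allocation falls back to discrete pages (4k pages assumed):

    #include <stdio.h>

    #define BBSHIFT         9               /* 512-byte basic blocks */
    #define BBTOB(bbs)      ((long long)(bbs) << BBSHIFT)
    #define BTOBB(bytes)    (((bytes) + (1 << BBSHIFT) - 1) >> BBSHIFT)
    #define PAGE_SHIFT      12              /* assumed 4k pages */

    /*
     * A buffer may start and end at sub-page offsets on disk, so the
     * page span is computed from the disk address, not the length alone.
     */
    static int page_span(long long blkno, int numblks)
    {
        long long start = BBTOB(blkno) >> PAGE_SHIFT;
        long long end = (BBTOB(blkno + numblks) + (1 << PAGE_SHIFT) - 1)
                                >> PAGE_SHIFT;

        return (int)(end - start);
    }

    int main(void)
    {
        /* 8 basic blocks (4k) starting at sector 7 straddle two pages */
        printf("%d\n", page_span(7, 8));    /* prints 2 */
        return 0;
    }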
*/ - if (bp->b_buffer_length < PAGE_SIZE) { - bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; } - if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & - PAGE_MASK) != + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ kmem_free(bp->b_addr); @@ -319,13 +329,14 @@ xfs_buf_allocate_memory( bp->b_pages = bp->b_page_array; bp->b_pages[0] = virt_to_page(bp->b_addr); bp->b_page_count = 1; - bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + bp->b_flags |= _XBF_KMEM; return 0; } use_alloc_page: - end = bp->b_file_offset + bp->b_buffer_length; - page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); + start = BBTOB(bp->b_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count = end - start; error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; @@ -388,8 +399,9 @@ _xfs_buf_map_pages( if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } else if (flags & XBF_MAPPED) { + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { int retried = 0; do { @@ -403,7 +415,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; - bp->b_flags |= XBF_MAPPED; } return 0; @@ -420,29 +431,27 @@ _xfs_buf_map_pages( */ xfs_buf_t * _xfs_buf_find( - xfs_buftarg_t *btp, /* block device target */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + struct xfs_buftarg *btp, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, xfs_buf_t *new_bp) { - xfs_off_t range_base; - size_t range_length; + size_t numbytes; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent; xfs_buf_t *bp; - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); + numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < (1 << btp->bt_sshift))); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, ioff)); + xfs_daddr_to_agno(btp->bt_mount, blkno)); /* walk tree */ spin_lock(&pag->pag_buf_lock); @@ -453,20 +462,20 @@ _xfs_buf_find( parent = *rbp; bp = rb_entry(parent, struct xfs_buf, b_rbnode); - if (range_base < bp->b_file_offset) + if (blkno < bp->b_bn) rbp = &(*rbp)->rb_left; - else if (range_base > bp->b_file_offset) + else if (blkno > bp->b_bn) rbp = &(*rbp)->rb_right; else { /* - * found a block offset match. If the range doesn't + * found a block number match. If the range doesn't * match, the only way this is allowed is if the buffer * in the cache is stale and the transaction that made * it stale has not yet committed. i.e. we are * reallocating a busy extent. Skip this buffer and * continue searching to the right for an exact match. 
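The cache lookup rule spelled out at the end of the hunk above (same block number, different length, keep searching right past the stale entry) is compact enough to show in isolation. A plain binary-search-tree rendition, standing in for the kernel rbtree:

    #include <stdbool.h>
    #include <stddef.h>

    struct buf_node {
        struct buf_node *left, *right;
        long long bn;                       /* disk address key */
        int len;                            /* length in basic blocks */
        bool stale;                         /* the kernel asserts this on skips */
    };

    /*
     * On a key match with a length mismatch the cached buffer must be a
     * stale leftover from a freed extent being reallocated, so continue
     * to the right for the exact-length match.
     */
    static struct buf_node *buf_find(struct buf_node *n, long long bn, int len)
    {
        while (n) {
            if (bn < n->bn)
                n = n->left;
            else if (bn > n->bn)
                n = n->right;
            else if (n->len != len)
                n = n->right;               /* skip stale duplicate */
            else
                return n;
        }
        return NULL;
    }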
*/ - if (bp->b_buffer_length != range_length) { + if (bp->b_length != numblks) { ASSERT(bp->b_flags & XBF_STALE); rbp = &(*rbp)->rb_right; continue; @@ -511,7 +520,7 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -526,63 +535,59 @@ found: */ struct xfs_buf * xfs_buf_get( - xfs_buftarg_t *target,/* target for buffer */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ + xfs_buftarg_t *target, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { struct xfs_buf *bp; struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, ioff, isize, flags, NULL); + bp = _xfs_buf_find(target, blkno, numblks, flags, NULL); if (likely(bp)) goto found; - new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT, - flags); + new_bp = xfs_buf_alloc(target, blkno, numblks, flags); if (unlikely(!new_bp)) return NULL; - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); - if (!bp) { + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { kmem_zone_free(xfs_buf_zone, new_bp); return NULL; } - if (bp == new_bp) { - error = xfs_buf_allocate_memory(bp, flags); - if (error) - goto no_buffer; - } else - kmem_zone_free(xfs_buf_zone, new_bp); + bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); /* * Now we have a workable buffer, fill in the block number so * that we can do IO on it. */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; + bp->b_bn = blkno; + bp->b_io_length = bp->b_length; found: - if (!(bp->b_flags & XBF_MAPPED)) { + if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } } XFS_STATS_INC(xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; - -no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } STATIC int @@ -590,32 +595,30 @@ _xfs_buf_read( xfs_buf_t *bp, xfs_buf_flags_t flags) { - int status; - - ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - status = xfs_buf_iorequest(bp); - if (status || bp->b_error || (flags & XBF_ASYNC)) - return status; + xfs_buf_iorequest(bp); + if (flags & XBF_ASYNC) + return 0; return xfs_buf_iowait(bp); } xfs_buf_t * xfs_buf_read( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, + xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags) { xfs_buf_t *bp; flags |= XBF_READ; - bp = xfs_buf_get(target, ioff, isize, flags); + bp = xfs_buf_get(target, blkno, numblks, flags); if (bp) { trace_xfs_buf_read(bp, flags, _RET_IP_); @@ -627,7 +630,8 @@ xfs_buf_read( * Read ahead call which is already satisfied, * drop the buffer */ - goto no_buffer; + xfs_buf_relse(bp); + return NULL; } else { /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; @@ -635,12 +639,6 @@ xfs_buf_read( } return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; } /* @@ -650,14 
+648,14 @@ xfs_buf_read( void xfs_buf_readahead( xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize) + xfs_daddr_t blkno, + size_t numblks) { if (bdi_read_congested(target->bt_bdi)) return; - xfs_buf_read(target, ioff, isize, - XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); + xfs_buf_read(target, blkno, numblks, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); } /* @@ -666,16 +664,15 @@ xfs_buf_readahead( */ struct xfs_buf * xfs_buf_read_uncached( - struct xfs_mount *mp, struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t length, + size_t numblks, int flags) { xfs_buf_t *bp; int error; - bp = xfs_buf_get_uncached(target, length, flags); + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) return NULL; @@ -683,9 +680,9 @@ xfs_buf_read_uncached( XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - xfsbdstrat(mp, bp); + xfsbdstrat(target->bt_mount, bp); error = xfs_buf_iowait(bp); - if (error || bp->b_error) { + if (error) { xfs_buf_relse(bp); return NULL; } @@ -699,7 +696,7 @@ xfs_buf_read_uncached( void xfs_buf_set_empty( struct xfs_buf *bp, - size_t len) + size_t numblks) { if (bp->b_pages) _xfs_buf_free_pages(bp); @@ -707,10 +704,9 @@ xfs_buf_set_empty( bp->b_pages = NULL; bp->b_page_count = 0; bp->b_addr = NULL; - bp->b_file_offset = 0; - bp->b_buffer_length = bp->b_count_desired = len; + bp->b_length = numblks; + bp->b_io_length = numblks; bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_flags &= ~XBF_MAPPED; } static inline struct page * @@ -749,7 +745,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; @@ -760,9 +756,8 @@ xfs_buf_associate_memory( pageaddr += PAGE_SIZE; } - bp->b_count_desired = len; - bp->b_buffer_length = buflen; - bp->b_flags |= XBF_MAPPED; + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); return 0; } @@ -770,17 +765,18 @@ xfs_buf_associate_memory( xfs_buf_t * xfs_buf_get_uncached( struct xfs_buftarg *target, - size_t len, + size_t numblks, int flags) { - unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; + unsigned long page_count; int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, len, 0); + bp = xfs_buf_alloc(target, 0, numblks, 0); if (unlikely(bp == NULL)) goto fail; + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; error = _xfs_buf_get_pages(bp, page_count, 0); if (error) goto fail_free_buf; @@ -792,7 +788,7 @@ xfs_buf_get_uncached( } bp->b_flags |= _XBF_PAGES; - error = _xfs_buf_map_pages(bp, XBF_MAPPED); + error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages\n", __func__); @@ -855,7 +851,7 @@ xfs_buf_rele( spin_unlock(&pag->pag_buf_lock); } else { xfs_buf_lru_del(bp); - ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); @@ -915,13 +911,6 @@ xfs_buf_lock( trace_xfs_buf_lock_done(bp, _RET_IP_); } -/* - * Releases the lock on the buffer object. - * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to - * take a reference for the delwri queue because the unlocker is going to - * drop their's and they don't know we just queued it. 
- */ void xfs_buf_unlock( struct xfs_buf *bp) @@ -1008,9 +997,8 @@ xfs_buf_ioerror_alert( const char *func) { xfs_alert(bp->b_target->bt_mount, -"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd", - (__uint64_t)XFS_BUF_ADDR(bp), func, - bp->b_error, XFS_BUF_COUNT(bp)); +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); } int @@ -1019,10 +1007,11 @@ xfs_bwrite( { int error; + ASSERT(xfs_buf_islocked(bp)); + bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); - xfs_buf_delwri_dequeue(bp); xfs_bdstrat_cb(bp); error = xfs_buf_iowait(bp); @@ -1181,7 +1170,7 @@ _xfs_buf_ioapply( int rw, map_i, total_nr_pages, nr_pages; struct bio *bio; int offset = bp->b_offset; - int size = bp->b_count_desired; + int size = BBTOB(bp->b_io_length); sector_t sector = bp->b_bn; total_nr_pages = bp->b_page_count; @@ -1229,7 +1218,7 @@ next_chunk: break; offset = 0; - sector += nbytes >> BBSHIFT; + sector += BTOBB(nbytes); size -= nbytes; total_nr_pages--; } @@ -1248,13 +1237,13 @@ next_chunk: } } -int +void xfs_buf_iorequest( xfs_buf_t *bp) { trace_xfs_buf_iorequest(bp, _RET_IP_); - ASSERT(!(bp->b_flags & XBF_DELWRI)); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); @@ -1269,13 +1258,12 @@ xfs_buf_iorequest( _xfs_buf_ioend(bp, 0); xfs_buf_rele(bp); - return 0; } /* - * Waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. - * It returns the I/O error code, if any, or 0 if there was no error. + * Waits for I/O to complete on the buffer supplied. It returns immediately if + * no I/O is pending or there is already a pending error on the buffer. It + * returns the I/O error code, if any, or 0 if there was no error. 
*/ int xfs_buf_iowait( @@ -1283,7 +1271,8 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); + if (!bp->b_error) + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); return bp->b_error; @@ -1296,7 +1285,7 @@ xfs_buf_offset( { struct page *page; - if (bp->b_flags & XBF_MAPPED) + if (bp->b_addr) return bp->b_addr + offset; offset += bp->b_offset; @@ -1315,27 +1304,30 @@ xfs_buf_iomove( void *data, /* data address */ xfs_buf_rw_t mode) /* read/write/zero flag */ { - size_t bend, cpoff, csize; - struct page *page; + size_t bend; bend = boff + bsize; while (boff < bend) { - page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; - cpoff = xfs_buf_poff(boff + bp->b_offset); - csize = min_t(size_t, - PAGE_SIZE-cpoff, bp->b_count_desired-boff); + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); - ASSERT(((csize + cpoff) <= PAGE_SIZE)); + ASSERT((csize + page_offset) <= PAGE_SIZE); switch (mode) { case XBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); + memset(page_address(page) + page_offset, 0, csize); break; case XBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); + memcpy(data, page_address(page) + page_offset, csize); break; case XBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); + memcpy(page_address(page) + page_offset, data, csize); } boff += csize; @@ -1435,11 +1427,9 @@ xfs_free_buftarg( { unregister_shrinker(&btp->bt_shrinker); - xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - kthread_stop(btp->bt_task); kmem_free(btp); } @@ -1491,20 +1481,6 @@ xfs_setsize_buftarg( return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); } -STATIC int -xfs_alloc_delwri_queue( - xfs_buftarg_t *btp, - const char *fsname) -{ - INIT_LIST_HEAD(&btp->bt_delwri_queue); - spin_lock_init(&btp->bt_delwri_lock); - btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); - if (IS_ERR(btp->bt_task)) - return PTR_ERR(btp->bt_task); - return 0; -} - xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -1527,8 +1503,6 @@ xfs_alloc_buftarg( spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_alloc_delwri_queue(btp, fsname)) - goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; btp->bt_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&btp->bt_shrinker); @@ -1539,125 +1513,52 @@ error: return NULL; } - /* - * Delayed write buffer handling + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. 
*/ -void +bool xfs_buf_delwri_queue( - xfs_buf_t *bp) + struct xfs_buf *bp, + struct list_head *list) { - struct xfs_buftarg *btp = bp->b_target; - - trace_xfs_buf_delwri_queue(bp, _RET_IP_); - + ASSERT(xfs_buf_islocked(bp)); ASSERT(!(bp->b_flags & XBF_READ)); - spin_lock(&btp->bt_delwri_lock); - if (!list_empty(&bp->b_list)) { - /* if already in the queue, move it to the tail */ - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_move_tail(&bp->b_list, &btp->bt_delwri_queue); - } else { - /* start xfsbufd as it is about to have something to do */ - if (list_empty(&btp->bt_delwri_queue)) - wake_up_process(bp->b_target->bt_task); - - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC; - list_add_tail(&bp->b_list, &btp->bt_delwri_queue); - } - bp->b_queuetime = jiffies; - spin_unlock(&btp->bt_delwri_lock); -} - -void -xfs_buf_delwri_dequeue( - xfs_buf_t *bp) -{ - int dequeued = 0; - - spin_lock(&bp->b_target->bt_delwri_lock); - if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_del_init(&bp->b_list); - dequeued = 1; + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(&bp->b_target->bt_delwri_lock); - - if (dequeued) - xfs_buf_rele(bp); - - trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); -} - -/* - * If a delwri buffer needs to be pushed before it has aged out, then promote - * it to the head of the delwri queue so that it will be flushed on the next - * xfsbufd run. We do this by resetting the queuetime of the buffer to be older - * than the age currently needed to flush the buffer. Hence the next time the - * xfsbufd sees it is guaranteed to be considered old enough to flush. - */ -void -xfs_buf_delwri_promote( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1; - ASSERT(bp->b_flags & XBF_DELWRI); - ASSERT(bp->b_flags & _XBF_DELWRI_Q); + trace_xfs_buf_delwri_queue(bp, _RET_IP_); /* - * Check the buffer age before locking the delayed write queue as we - * don't need to promote buffers that are already past the flush age. + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. */ - if (bp->b_queuetime < jiffies - age) - return; - bp->b_queuetime = jiffies - age; - spin_lock(&btp->bt_delwri_lock); - list_move(&bp->b_list, &btp->bt_delwri_queue); - spin_unlock(&btp->bt_delwri_lock); -} - -/* - * Move as many buffers as specified to the supplied list - * idicating if we skipped any buffers to prevent deadlocks. 
- */ -STATIC int -xfs_buf_delwri_split( - xfs_buftarg_t *target, - struct list_head *list, - unsigned long age) -{ - xfs_buf_t *bp, *n; - int skipped = 0; - int force; - - force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); - INIT_LIST_HEAD(list); - spin_lock(&target->bt_delwri_lock); - list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) { - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { - if (!force && - time_before(jiffies, bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move_tail(&bp->b_list, list); - trace_xfs_buf_delwri_split(bp, _RET_IP_); - } else - skipped++; + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); } - spin_unlock(&target->bt_delwri_lock); - return skipped; + return true; } /* @@ -1683,99 +1584,109 @@ xfs_buf_cmp( return 0; } -STATIC int -xfsbufd( - void *data) +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) { - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - - current->flags |= PF_MEMALLOC; - - set_freezable(); + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } - do { - long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - struct list_head tmp; - struct blk_plug plug; + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } - if (unlikely(freezing(current))) - try_to_freeze(); + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } - /* sleep for a long time if there is nothing to do. */ - if (list_empty(&target->bt_delwri_queue)) - tout = MAX_SCHEDULE_TIMEOUT; - schedule_timeout_interruptible(tout); + list_sort(NULL, io_list, xfs_buf_cmp); - xfs_buf_delwri_split(target, &tmp, age); - list_sort(NULL, &tmp, xfs_buf_cmp); + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags |= XBF_WRITE; - blk_start_plug(&plug); - while (!list_empty(&tmp)) { - struct xfs_buf *bp; - bp = list_first_entry(&tmp, struct xfs_buf, b_list); + if (!wait) { + bp->b_flags |= XBF_ASYNC; list_del_init(&bp->b_list); - xfs_bdstrat_cb(bp); } - blk_finish_plug(&plug); - } while (!kthread_should_stop()); + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); - return 0; + return pinned; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. 
AIL pushing as the @buffer_list is consumed in this + * function. */ int -xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) { - xfs_buf_t *bp; - int pincount = 0; - LIST_HEAD(tmp_list); - LIST_HEAD(wait_list); - struct blk_plug plug; + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} - flush_workqueue(xfslogd_workqueue); +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; - set_bit(XBT_FORCE_FLUSH, &target->bt_flags); - pincount = xfs_buf_delwri_split(target, &tmp_list, 0); + __xfs_buf_delwri_submit(buffer_list, &io_list, true); - /* - * Dropped the delayed write list lock, now walk the temporary list. - * All I/O is issued async and then if we need to wait for completion - * we do that after issuing all the IO. - */ - list_sort(NULL, &tmp_list, xfs_buf_cmp); + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); - blk_start_plug(&plug); - while (!list_empty(&tmp_list)) { - bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); - ASSERT(target == bp->b_target); list_del_init(&bp->b_list); - if (wait) { - bp->b_flags &= ~XBF_ASYNC; - list_add(&bp->b_list, &wait_list); - } - xfs_bdstrat_cb(bp); - } - blk_finish_plug(&plug); - - if (wait) { - /* Wait for IO to complete. */ - while (!list_empty(&wait_list)) { - bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - - list_del_init(&bp->b_list); - xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } + error2 = xfs_buf_iowait(bp); + xfs_buf_relse(bp); + if (!error) + error = error2; } - return pincount; + return error; } int __init diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5bf3be4..7f1d139 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -32,11 +32,6 @@ #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - typedef enum { XBRW_READ = 1, /* transfer into target memory */ XBRW_WRITE = 2, /* transfer from target memory */ @@ -46,11 +41,9 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ -#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -58,14 +51,13 @@ typedef enum { #define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ /* flags used only as arguments to access routines */ -#define XBF_LOCK (1 << 
15)/* lock requested */ #define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ -#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ /* flags used only internally */ #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ typedef unsigned int xfs_buf_flags_t; @@ -73,25 +65,18 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_MAPPED, "MAPPED" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ - { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ - { XBF_LOCK, "LOCK" }, /* should never be set */\ - { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ - { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" } -typedef enum { - XBT_FORCE_FLUSH = 0, -} xfs_buftarg_flags_t; - typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; @@ -101,12 +86,6 @@ typedef struct xfs_buftarg { unsigned int bt_sshift; size_t bt_smask; - /* per device delwri queue */ - struct task_struct *bt_task; - struct list_head bt_delwri_queue; - spinlock_t bt_delwri_lock; - unsigned long bt_flags; - /* LRU control structures */ struct shrinker bt_shrinker; struct list_head bt_lru; @@ -128,8 +107,8 @@ typedef struct xfs_buf { * fast-path on locking. */ struct rb_node b_rbnode; /* rbtree node */ - xfs_off_t b_file_offset; /* offset in file */ - size_t b_buffer_length;/* size of buffer in bytes */ + xfs_daddr_t b_bn; /* block number for I/O */ + int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ @@ -140,8 +119,6 @@ typedef struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ - xfs_daddr_t b_bn; /* block number for I/O */ - size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; xfs_buf_iodone_t b_iodone; /* I/O completion function */ @@ -150,7 +127,7 @@ typedef struct xfs_buf { struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ - unsigned long b_queuetime; /* time buffer was queued */ + int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ @@ -163,26 +140,30 @@ typedef struct xfs_buf { /* Finding and Reading Buffers */ -extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t, xfs_buf_t *); +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags, + struct xfs_buf *new_bp); #define xfs_incore(buftarg,blkno,len,lockit) \ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) -extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, - 
xfs_buf_flags_t); - -struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *, xfs_off_t, size_t, - xfs_buf_flags_t); -extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); -extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); -extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); -extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, - struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t length, int flags); +struct xfs_buf *xfs_buf_get(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_readahead(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks); + +struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +struct xfs_buf *xfs_buf_alloc(struct xfs_buftarg *target, xfs_daddr_t blkno, + size_t numblks, xfs_buf_flags_t flags); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t numblks, int flags); +void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_free(xfs_buf_t *); @@ -204,7 +185,7 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern int xfs_buf_iorequest(xfs_buf_t *); +extern void xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); @@ -220,24 +201,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp) extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_queue(struct xfs_buf *); -extern void xfs_buf_delwri_dequeue(struct xfs_buf *); -extern void xfs_buf_delwri_promote(struct xfs_buf *); +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); /* Buffer Daemon Setup Routines */ extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ - ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) -#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) - #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) @@ -256,12 +235,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) -#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) 
((bp)->b_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { @@ -287,7 +260,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); -extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index eac97ef..45df2b8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -123,11 +122,11 @@ xfs_buf_item_log_check( ASSERT(bip->bli_logged != NULL); bp = bip->bli_buf; - ASSERT(XFS_BUF_COUNT(bp) > 0); + ASSERT(bp->b_length > 0); ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; buffer = bp->b_addr; - for (x = 0; x < XFS_BUF_COUNT(bp); x++) { + for (x = 0; x < BBTOB(bp->b_length); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, "%s: bip %x buffer %x orig %x index %d", @@ -418,7 +417,6 @@ xfs_buf_item_unpin( if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); @@ -455,42 +453,42 @@ xfs_buf_item_unpin( bp->b_iodone = NULL; } else { spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); + } else if (freed && remove) { + xfs_buf_lock(bp); + xfs_buf_ioerror(bp, EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); } } -/* - * This is called to attempt to lock the buffer associated with this - * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, take a - * reference to the buffer. If this is a delayed write buffer that - * needs AIL help to be written back, invoke the pushbuf routine - * rather than the normal success path. - */ STATIC uint -xfs_buf_item_trylock( - struct xfs_log_item *lip) +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; - /* take a reference to the buffer. */ - xfs_buf_hold(bp); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - trace_xfs_buf_item_trylock(bip); - if (XFS_BUF_ISDELAYWRITE(bp)) - return XFS_ITEM_PUSHBUF; - return XFS_ITEM_SUCCESS; + + trace_xfs_buf_item_push(bip); + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; } /* @@ -603,49 +601,6 @@ xfs_buf_item_committed( return lsn; } -/* - * The buffer is locked, but is not a delayed write buffer. This happens - * if we race with IO completion and hence we don't want to try to write it - * again. Just release the buffer. 
- */ -STATIC void -xfs_buf_item_push( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_push(bip); - - xfs_buf_relse(bp); -} - -/* - * The buffer is locked and is a delayed write buffer. Promote the buffer - * in the delayed write queue as the caller knows that they must invoke - * the xfsbufd to get this buffer written. We have to unlock the buffer - * to allow the xfsbufd to write it, too. - */ -STATIC bool -xfs_buf_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_buf_log_item *bip = BUF_ITEM(lip); - struct xfs_buf *bp = bip->bli_buf; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(XFS_BUF_ISDELAYWRITE(bp)); - - trace_xfs_buf_item_pushbuf(bip); - - xfs_buf_delwri_promote(bp); - xfs_buf_relse(bp); - return true; -} - STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, @@ -661,11 +616,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, - .iop_trylock = xfs_buf_item_trylock, .iop_unlock = xfs_buf_item_unlock, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, - .iop_pushbuf = xfs_buf_item_pushbuf, .iop_committing = xfs_buf_item_committing }; @@ -703,7 +656,8 @@ xfs_buf_item_init( * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); + chunks = (int)((BBTOB(bp->b_length) + (XFS_BLF_CHUNK - 1)) >> + XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, @@ -713,7 +667,7 @@ xfs_buf_item_init( xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); + bip->bli_format.blf_len = (ushort)bp->b_length; bip->bli_format.blf_map_size = map_size; #ifdef XFS_TRANS_DEBUG @@ -725,9 +679,9 @@ xfs_buf_item_init( * the buffer to indicate which bytes the callers have asked * to have logged. */ - bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); - bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); + bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); + memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); + bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); #endif /* @@ -984,20 +938,27 @@ xfs_buf_iodone_callbacks( * If the write was asynchronous then no one will be looking for the * error. Clear the error state and write the buffer out again. * - * During sync or umount we'll write all pending buffers again - * synchronous, which will catch these errors if they keep hanging - * around. + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon, as non-transient + * errors tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here.
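The reworked xfs_buf_item_push() above hands dirty buffers to a caller-owned list instead of the old per-target delwri queue. A minimal sketch of the new calling convention follows; example_write_buffer() is a hypothetical helper, and it assumes the buffer lock is held when queueing, matching the push handler above:

static int
example_write_buffer(
	struct xfs_buf	*bp)		/* locked on entry */
{
	LIST_HEAD(buffer_list);		/* caller-local delwri queue */
	bool		queued;

	/*
	 * A false return means the buffer already sits on another
	 * delwri list (tracked by _XBF_DELWRI_Q); its owner will
	 * write it for us.
	 */
	queued = xfs_buf_delwri_queue(bp, &buffer_list);
	xfs_buf_unlock(bp);
	if (!queued)
		return 0;

	/* sort by disk address, issue the I/O, wait for completion */
	return xfs_buf_delwri_submit(&buffer_list);
}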
*/ if (XFS_BUF_ISASYNC(bp)) { + ASSERT(bp->b_iodone != NULL); + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { - xfs_buf_delwri_queue(bp); - XFS_BUF_DONE(bp); + bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + xfs_bdstrat_cb(bp); + } else { + xfs_buf_relse(bp); } - ASSERT(bp->b_iodone != NULL); - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - xfs_buf_relse(bp); + return; } @@ -1045,6 +1006,6 @@ xfs_buf_iodone( * Either way, AIL is useless if we're forcing a shutdown. */ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7f1a6f5..015b946 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -2277,20 +2276,20 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) if (nbuf == 1) { dabuf->nbuf = 1; bp = bps[0]; - dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount = bp->b_length; dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); + dabuf->bbcount += bp->b_length; } dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < nbuf; i++, off += BBTOB(bp->b_length)) { bp = bps[i]; memcpy((char *)dabuf->data + off, bp->b_addr, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } return dabuf; @@ -2310,10 +2309,10 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) ASSERT(dabuf->nbuf > 1); dabuf->dirty = 0; for (i = off = 0; i < dabuf->nbuf; - i++, off += XFS_BUF_COUNT(bp)) { + i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; memcpy(bp->b_addr, dabuf->data + off, - XFS_BUF_COUNT(bp)); + BBTOB(bp->b_length)); } } } @@ -2356,10 +2355,10 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) } dabuf->dirty = 1; ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { + for (i = off = 0; i < dabuf->nbuf; i++, off += BBTOB(bp->b_length)) { bp = dabuf->bps[i]; f = off; - l = f + XFS_BUF_COUNT(bp) - 1; + l = f + BBTOB(bp->b_length) - 1; if (f < first) f = first; if (l > last) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 1137bbc..e00de08 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index a2e2701..67a250c36 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index d3b63ae..586732f 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 
5bbe2a8..2046988 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 66e108f..397ffbc 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 0179a41..b0f2678 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 79d05e8..19bf0c5 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 1ad3a4b..f9c3fe3 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -30,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" #include "xfs_trace.h" @@ -118,7 +118,7 @@ xfs_trim_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. */ - if (xfs_alloc_busy_search(mp, agno, fbno, flen)) { + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } @@ -212,7 +212,7 @@ xfs_discard_extents( struct xfs_mount *mp, struct list_head *list) { - struct xfs_busy_extent *busyp; + struct xfs_extent_busy *busyp; int error = 0; list_for_each_entry(busyp, list, list) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1155208..bf27fcca 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -857,7 +856,7 @@ xfs_qm_dqflush_done( /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); if (lip->li_lsn == qip->qli_flush_lsn) - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); else spin_unlock(&ailp->xa_lock); } @@ -878,8 +877,8 @@ xfs_qm_dqflush_done( */ int xfs_qm_dqflush( - xfs_dquot_t *dqp, - uint flags) + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; @@ -891,25 +890,30 @@ xfs_qm_dqflush( trace_xfs_dqflush(dqp); - /* - * If not dirty, or it's pinned and we are not supposed to block, nada. - */ - if (!XFS_DQ_IS_DIRTY(dqp) || - ((flags & SYNC_TRYLOCK) && atomic_read(&dqp->q_pincount) > 0)) { - xfs_dqfunlock(dqp); - return 0; - } + *bpp = NULL; + xfs_qm_dqunpin_wait(dqp); /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. 
+ * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; dqp->dq_flags &= ~XFS_DQ_DIRTY; - xfs_dqfunlock(dqp); - return XFS_ERROR(EIO); + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = XFS_ERROR(EIO); + goto out_unlock; } /* @@ -917,11 +921,8 @@ xfs_qm_dqflush( */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) { - ASSERT(error != ENOENT); - xfs_dqfunlock(dqp); - return error; - } + if (error) + goto out_unlock; /* * Calculate the location of the dquot inside the buffer. @@ -967,20 +968,13 @@ xfs_qm_dqflush( xfs_log_force(mp, 0); } - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; - /* - * dqp is still locked, but caller is free to unlock it now. - */ - return error; - +out_unlock: + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); } /* @@ -1011,39 +1005,6 @@ xfs_dqlock2( } } -/* - * Give the buffer a little push if it is incore and - * wait on the flush lock. - */ -void -xfs_dqflock_pushbuf_wait( - xfs_dquot_t *dqp) -{ - xfs_mount_t *mp = dqp->q_mount; - xfs_buf_t *bp; - - /* - * Check to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - if (!bp) - goto out_lock; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - if (xfs_buf_ispinned(bp)) - xfs_log_force(mp, 0); - xfs_buf_delwri_promote(bp); - wake_up_process(bp->b_target->bt_task); - } - xfs_buf_relse(bp); -out_lock: - xfs_dqflock(dqp); -} - int __init xfs_qm_init(void) { diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index ef9190b..7d20af2 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -141,7 +141,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(xfs_dquot_t *, uint); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); @@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); -extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 34baeae..57aa4b0 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -108,38 +106,6 @@ xfs_qm_dquot_logitem_unpin( wake_up(&dqp->q_pinwait); } -/* - * Given the logitem, this writes the corresponding dquot entry to disk - * asynchronously.
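xfs_qm_dqflush() now returns the dirty dquot buffer locked through *bpp instead of writing it itself, so the write policy lives entirely with the caller. A hedged sketch of a synchronous caller (the helper name is hypothetical; the dquot is assumed locked on entry):

static int
example_flush_dquot_sync(
	struct xfs_dquot	*dqp)	/* locked on entry */
{
	struct xfs_buf		*bp;
	int			error;

	xfs_dqflock(dqp);		/* take the flush lock */
	error = xfs_qm_dqflush(dqp, &bp);
	if (error)			/* flush lock dropped on error */
		return error;

	error = xfs_bwrite(bp);		/* write synchronously ... */
	xfs_buf_relse(bp);		/* ... and release the buffer */
	return error;
}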
This is called with the dquot entry securely locked; - * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot - * at the end. - */ -STATIC void -xfs_qm_dquot_logitem_push( - struct xfs_log_item *lip) -{ - struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - int error; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(!completion_done(&dqp->q_flush)); - - /* - * Since we were able to lock the dquot's flush lock and - * we found it on the AIL, the dquot must be dirty. This - * is because the dquot is removed from the AIL while still - * holding the flush lock in xfs_dqflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the dquot. - */ - error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); - if (error) - xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", - __func__, error, dqp); - xfs_dqunlock(dqp); -} - STATIC xfs_lsn_t xfs_qm_dquot_logitem_committed( struct xfs_log_item *lip, @@ -171,67 +137,15 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } -/* - * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that - * the dquot is locked by us, but the flush lock isn't. So, here we are - * going to see if the relevant dquot buffer is incore, waiting on DELWRI. - * If so, we want to push it out to help us take this item off the AIL as soon - * as possible. - * - * We must not be holding the AIL lock at this point. Calling incore() to - * search the buffer cache can be a time consuming thing, and AIL lock is a - * spinlock. - */ -STATIC bool -xfs_qm_dquot_logitem_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - struct xfs_dquot *dqp = qlip->qli_dquot; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. - */ - if (completion_done(&dqp->q_flush) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_dqunlock(dqp); - return true; - } - - bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, - dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); - xfs_dqunlock(dqp); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to attempt to lock the dquot associated with this - * dquot log item. Don't sleep on the dquot lock or the flush lock. - * If the flush lock is already held, indicating that the dquot has - * been or is in the process of being flushed, then see if we can - * find the dquot's buffer in the buffer cache without sleeping. If - * we can and it is marked delayed write, then we want to send it out. - * We delay doing so until the push routine, though, to avoid sleeping - * in any device strategy routines. 
- */ STATIC uint -xfs_qm_dquot_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; @@ -239,16 +153,41 @@ xfs_qm_dquot_logitem_trylock( if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_dqflock_nowait(dqp)) { - /* - * dquot has already been flushed to the backing buffer, - * leave it locked, pushbuf routine will unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - ASSERT(lip->li_flags & XFS_LI_IN_AIL); - return XFS_ITEM_SUCCESS; + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; } /* @@ -299,11 +238,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, - .iop_trylock = xfs_qm_dquot_logitem_trylock, .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf, .iop_committing = xfs_qm_dquot_logitem_committing }; @@ -398,11 +335,13 @@ xfs_qm_qoff_logitem_unpin( } /* - * Quotaoff items have no locking, so just return success. + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. */ STATIC uint -xfs_qm_qoff_logitem_trylock( - struct xfs_log_item *lip) +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_LOCKED; } @@ -429,17 +368,6 @@ xfs_qm_qoff_logitem_committed( return lsn; } -/* - * There isn't much you can do to push on an quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip) -{ -} - - STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( struct xfs_log_item *lip, @@ -454,7 +382,7 @@ xfs_qm_qoffend_logitem_committed( * xfs_trans_ail_delete() drops the AIL lock. 
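With trylock, push and pushbuf folded into a single method, every log item type now answers one question: push yourself onto this buffer list and report how it went. A rough sketch of the dispatch this implies on the AIL side (loop structure and names here are illustrative, not taken from this patch):

static void
example_push_one_item(
	struct xfs_log_item	*lip)
{
	LIST_HEAD(buffer_list);
	bool			force_log = false;

	switch (lip->li_ops->iop_push(lip, &buffer_list)) {
	case XFS_ITEM_SUCCESS:		/* flushed, buffer queued */
		break;
	case XFS_ITEM_PINNED:		/* needs a log force to unpin */
		force_log = true;
		break;
	case XFS_ITEM_LOCKED:		/* busy elsewhere, retry later */
		break;
	case XFS_ITEM_FLUSHING:		/* already under I/O, wait */
		break;
	}

	if (force_log)
		xfs_log_force(lip->li_mountp, 0);

	/* issue everything queued above without waiting for it */
	xfs_buf_delwri_submit_nowait(&buffer_list);
}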
*/ spin_lock(&ailp->xa_lock); - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); + xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); kmem_free(qfs); kmem_free(qfe); @@ -487,7 +415,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, @@ -502,7 +429,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_trylock = xfs_qm_qoff_logitem_trylock, .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoff_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 39f0633..6104560 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 558910f..2d25d19 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_types.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c new file mode 100644 index 0000000..85e9f87a --- /dev/null +++ b/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_inode.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to ensure that + * the block is not reused before this transaction commits.
+ */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entry's block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry.
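The tri-state return of xfs_extent_busy_search() exists so callers can cheaply tell "free to reuse" from any overlap with an uncommitted free. A hedged sketch of the classic consumer pattern (tp, mp, agno, bno and len are assumed locals, not code from this patch):

	/*
	 * 0: no overlap; 1: exact match; -1: partial overlap. Any
	 * non-zero result means the blocks were freed by a transaction
	 * that may not be on disk yet, so force this transaction out
	 * synchronously before the blocks can be reused.
	 */
	if (xfs_extent_busy_search(mp, agno, bno, len))
		xfs_trans_set_sync(tp);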
+ */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correctly, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. 
+ */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfectly */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as a failure indication. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set, skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead.
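As a worked example of the trim heuristics above (numbers illustrative): with args->minlen = 1 and args->maxlen = 16, a free candidate [fbno = 100, fend = 200) split by a busy range [bbno = 140, bend = 150) leaves a left piece of bbno - fbno = 40 blocks and a right piece of fend - bend = 50 blocks. Since 40 >= maxlen, the left piece is chosen and fend is trimmed to 140, keeping allocation patterns moving forward; the right piece would only win if the left were shorter than maxlen and the right at least maxlen * 4 = 64 blocks.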
+ */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h new file mode 100644 index 0000000..985412d --- /dev/null +++ b/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +/* + * Busy block/extent entry. Indexed by an rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op.
*/ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 35c2aff..feb36d7 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_sb.h" @@ -64,7 +63,8 @@ __xfs_efi_release( if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) { spin_lock(&ailp->xa_lock); /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, &efip->efi_item); + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -147,22 +147,20 @@ xfs_efi_item_unpin( } /* - * Efi items have no locking or pushing. However, since EFIs are - * pulled from the AIL when their corresponding EFDs are committed - * to disk, their situation is very similar to being pinned. Return - * XFS_ITEM_PINNED so that the caller will eventually flush the log. - * This should help in getting the EFI out of the AIL. + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. */ STATIC uint -xfs_efi_item_trylock( - struct xfs_log_item *lip) +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_PINNED; } -/* - * Efi items have no locking, so just return. - */ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) @@ -190,17 +188,6 @@ xfs_efi_item_committed( } /* - * There isn't much you can do to push on an efi item. It is simply - * stuck waiting for all of its corresponding efd items to be - * committed to disk. - */ -STATIC void -xfs_efi_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFI dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, .iop_unpin = xfs_efi_item_unpin, - .iop_trylock = xfs_efi_item_trylock, .iop_unlock = xfs_efi_item_unlock, .iop_committed = xfs_efi_item_committed, .iop_push = xfs_efi_item_push, @@ -404,19 +390,17 @@ xfs_efd_item_unpin( } /* - * Efd items have no locking, so just return success. + * There isn't much you can do to push on an efd item. 
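The header above also fixes the intended lifecycle of a busy-extent list at commit time: sort the entries by AG so per-AG locks are taken in a stable order, then drop them, optionally flagging discard candidates. A minimal sketch (the commit-side caller and the wants_discard flag are assumptions, not code from this patch):

	/* stable per-AG lock order first, then release the busy extents */
	xfs_extent_busy_sort(&tp->t_busy);
	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, wants_discard);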
It is simply stuck + * waiting for the log to be flushed to disk. */ STATIC uint -xfs_efd_item_trylock( - struct xfs_log_item *lip) +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - return XFS_ITEM_LOCKED; + return XFS_ITEM_PINNED; } -/* - * Efd items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ STATIC void xfs_efd_item_unlock( struct xfs_log_item *lip) @@ -451,16 +435,6 @@ xfs_efd_item_committed( } /* - * There isn't much you can do to push on an efd item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC void -xfs_efd_item_push( - struct xfs_log_item *lip) -{ -} - -/* * The EFD dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For @@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, .iop_unpin = xfs_efd_item_unpin, - .iop_trylock = xfs_efd_item_trylock, .iop_unlock = xfs_efd_item_unlock, .iop_committed = xfs_efd_item_committed, .iop_push = xfs_efd_item_push, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 54a67dd..8d214b8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_trans.h" @@ -396,114 +394,96 @@ xfs_file_splice_write( } /* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. + * This routine is called to handle zeroing any space in the last block of the + * file that is beyond the EOF. We do this since the size is being increased + * without writing anything to that block and we don't want to read the + * garbage on the disk. */ STATIC int /* error (positive) */ xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) + struct xfs_inode *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) { - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); + int zero_offset = XFS_B_FSB_OFFSET(mp, isize); + int zero_len; + int nimaps = 1; + int error = 0; + struct xfs_bmbt_irec imap; - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; + ASSERT(nimaps > 0); + /* * If the block underlying isize is just a hole, then there * is nothing to zero. */ - if (imap.br_startblock == HOLESTARTBLOCK) { + if (imap.br_startblock == HOLESTARTBLOCK) return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. 
- */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; + return xfs_iozero(ip, isize, zero_len); } /* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. + * Zero any on disk space between the current EOF and the new, larger EOF. + * + * This handles the normal case of zeroing the remainder of the last block in + * the file and the unusual case of zeroing blocks out beyond the size of the + * file. This second case only happens with fixed size extents and when the + * system crashes before the inode size was updated but after blocks were + * allocated. + * + * Expects the iolock to be held exclusive, and will take the ilock internally. */ - int /* error (positive) */ xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + struct xfs_bmbt_irec imap; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* * First handle zeroing the block on which isize resides. + * * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { + error = xfs_zero_last_block(ip, offset, isize); + if (error) + return error; } /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. + * Calculate the range between the new size and the old where blocks + * needing to be zeroed may exist. + * + * To get the block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back to a block + * boundary. We subtract 1 in case the size is exactly on a block + * boundary. */ last_fsb = isize ? 
XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); @@ -521,23 +501,18 @@ xfs_zero_eof( while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, &imap, &nimaps, 0); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } + ASSERT(nimaps > 0); if (imap.br_state == XFS_EXT_UNWRITTEN || imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); continue; @@ -545,11 +520,7 @@ xfs_zero_eof( /* * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); @@ -557,22 +528,14 @@ xfs_zero_eof( zero_len = offset - zero_off; error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } + if (error) + return error; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; } /* @@ -593,35 +556,29 @@ xfs_file_aio_write_checks( struct xfs_inode *ip = XFS_I(inode); int error = 0; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which involves - * dropping all locks and relocking to maintain correct locking order. - * If we do this, restart the function to ensure all checks and values - * are still valid. + * iolock shared, we need to update it to exclusive which implies + * having to redo all checks before. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); + xfs_rw_ilock(ip, *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + if (error) + return error; } - xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; /* * Updating the timestamps will grab the ilock again from @@ -638,7 +595,6 @@ restart: * people from modifying setuid and setgid binaries. 
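To make the last-block arithmetic above concrete (illustrative numbers): with a 4096-byte block size and isize = 10000, XFS_B_FSB_OFFSET() yields zero_offset = 10000 mod 4096 = 1808, so xfs_zero_last_block() zeroes zero_len = 4096 - 1808 = 2288 bytes, clamped to offset - isize when the extending write starts before the block ends. When zero_offset is 0, the old size sits exactly on a block boundary and the new caller skips the helper entirely.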
*/ return file_remove_suid(file); - } /* @@ -1007,8 +963,149 @@ xfs_vm_page_mkwrite( return block_page_mkwrite(vma, vmf, xfs_get_blocks); } +STATIC loff_t +xfs_seek_data( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec map[2]; + int nmap = 2; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ + end = XFS_B_TO_FSB(mp, isize); + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* + * Treat an unwritten extent as a data extent since it might + * contain dirty data in the page cache. + */ + if (map[0].br_startblock != HOLESTARTBLOCK) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[0].br_startoff)); + } else { + if (nmap == 1) { + error = ENXIO; + goto out_unlock; + } + + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[1].br_startoff)); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_seek_hole( + struct file *file, + loff_t start, + u32 type) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + loff_t holeoff; + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + uint lock; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + lock = xfs_ilock_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + error = xfs_bmap_first_unused(NULL, ip, 1, &fsbno, XFS_DATA_FORK); + if (error) + goto out_unlock; + + holeoff = XFS_FSB_TO_B(mp, fsbno); + if (holeoff <= start) + offset = start; + else { + /* + * xfs_bmap_first_unused() could return a value bigger than + * isize if there are no more holes past the supplied offset.
+ */ + offset = min_t(loff_t, holeoff, isize); + } + + if (offset != file->f_pos) + file->f_pos = offset; + +out_unlock: + xfs_iunlock_map_shared(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int origin) +{ + switch (origin) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, origin); + case SEEK_DATA: + return xfs_seek_data(file, offset, origin); + case SEEK_HOLE: + return xfs_seek_hole(file, offset, origin); + default: + return -EINVAL; + } +} + const struct file_operations xfs_file_operations = { - .llseek = generic_file_llseek, + .llseek = xfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = xfs_file_aio_read, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 1c6fdeb..c25b094 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -18,8 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -39,7 +37,6 @@ #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_rw.h" #include "xfs_filestream.h" #include "xfs_trace.h" @@ -147,9 +144,9 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -193,7 +190,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -230,7 +227,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { error = ENOMEM; goto error0; @@ -259,8 +256,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -286,8 +282,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -314,8 +309,7 @@ xfs_growfs_data_private( */ bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), - XBF_LOCK | XBF_MAPPED); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) { error = ENOMEM; goto error0; @@ -405,7 +399,7 @@ xfs_growfs_data_private( /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_read_buf(mp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) { @@ -693,3 +687,63 @@ xfs_fs_goingdown( return 0; } + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. 
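The xfs_file_llseek() dispatch added above is what makes SEEK_DATA and SEEK_HOLE visible to userspace on XFS. As a rough illustration of how a consumer exercises these whence values, here is a minimal sketch that walks the data segments of a sparse file; the file name and output format are invented, and errno handling is elided:

    #define _GNU_SOURCE		/* SEEK_DATA/SEEK_HOLE on glibc */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("sparse.img", O_RDONLY);	/* hypothetical test file */
        off_t end, data, hole = 0;

        if (fd < 0)
            return 1;
        end = lseek(fd, 0, SEEK_END);
        while (hole < end) {
            data = lseek(fd, hole, SEEK_DATA);	/* next data after hole */
            if (data < 0)			/* ENXIO: no more data */
                break;
            hole = lseek(fd, data, SEEK_HOLE);	/* hole ending that run */
            printf("data: [%lld, %lld)\n",
                   (long long)data, (long long)hole);
        }
        close(fd);
        return 0;
    }

Note that xfs_seek_data() deliberately reports unwritten (preallocated) extents as data, since the page cache may hold dirty pages over them that have not yet been converted.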
We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dad1a31..177a21a 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -200,8 +200,7 @@ xfs_ialloc_inode_init( */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!fbuf) return ENOMEM; /* @@ -610,6 +609,13 @@ xfs_ialloc_get_rec( /* * Visible inode allocation functions. */ +/* + * Find a free (set) bit in the inode bitmask. + */ +static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) +{ + return xfs_lowbit64(*fp); +} /* * Allocate an inode on disk. diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 666a037..65ac57c 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -47,15 +47,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Find a free (set) bit in the inode bitmask. - */ -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - - -/* * Allocate an inode on disk. * Mode is used to tell whether the new inode will need space, and whether * it is a directory. 
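The hunk above also moves xfs_ialloc_find_free() out of the header and next to its callers; the helper simply returns the index of the lowest set bit in the inode-chunk free mask. A small standalone sketch of that semantic, using a GCC builtin as a stand-in for xfs_lowbit64() (an assumption for illustration, not the kernel implementation):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for xfs_lowbit64(): lowest set bit, or -1. */
    static int lowbit64(uint64_t v)
    {
        return v ? __builtin_ctzll(v) : -1;
    }

    int main(void)
    {
        uint64_t inofree = 0xf0;	/* inodes 4-7 free in this chunk */

        printf("first free inode index: %d\n", lowbit64(inofree));
        return 0;
    }

A set bit means "free" here, so the first free inode in a chunk is found with a single bit-scan.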
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c6a7581..2b8b7a3 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bcc6c24..1bb4365 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -123,23 +122,7 @@ xfs_inode_free( xfs_idestroy_fork(ip, XFS_ATTR_FORK); if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -334,9 +317,10 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. + * region. Since we can be called from transaction context, don't + * recurse into the file system. */ - if (radix_tree_preload(GFP_KERNEL)) { + if (radix_tree_preload(GFP_NOFS)) { error = EAGAIN; goto out_destroy; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bc46c0a..a59eea0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,7 +20,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -61,6 +60,20 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +/* + * helper function to extract extent size hint from inode + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + #ifdef DEBUG /* * Make sure that the extents in the given memory buffer @@ -137,6 +150,7 @@ xfs_imap_to_bp( int ni; xfs_buf_t *bp; + buf_flags |= XBF_UNMAPPED; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, (int)imap->im_len, buf_flags, &bp); if (error) { @@ -226,7 +240,7 @@ xfs_inotobp( if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags); if (error) return error; @@ -782,8 +796,7 @@ xfs_iread( /* * Get pointers to the on-disk inode and the buffer containing it. 
*/ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, - XBF_LOCK, iget_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags); if (error) return error; dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -1342,7 +1355,7 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -1423,7 +1436,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp() returned error %d.", __func__, error); @@ -1484,7 +1497,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); if (error) { xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.", __func__, error); @@ -1566,8 +1579,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XBF_LOCK); + mp->m_bsize * blks_per_cluster, 0); if (!bp) return ENOMEM; @@ -1737,7 +1749,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0); if (error) return error; @@ -2347,11 +2359,11 @@ cluster_corrupt_out: */ rcu_read_unlock(); /* - * Clean up the buffer. If it was B_DELWRI, just release it -- + * Clean up the buffer. If it was delwri, just release it -- * brelse can handle it with no problems. If not, shut down the * filesystem before releasing the buffer. */ - bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); if (bufwasdelwri) xfs_buf_relse(bp); @@ -2377,30 +2389,29 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq); + xfs_iflush_abort(iq, false); kmem_free(ilist); xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } /* - * xfs_iflush() will write a modified inode's changes out to the - * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush completion must be - * active as well. The inode lock will still be held upon return from - * the call and the caller is free to unlock it. - * The inode flush will be completed when the inode reaches the disk. - * The flags indicate how the inode's buffer should be written out. + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. 
*/ int xfs_iflush( - xfs_inode_t *ip, - uint flags) + struct xfs_inode *ip, + struct xfs_buf **bpp) { - xfs_inode_log_item_t *iip; - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_mount_t *mp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; int error; XFS_STATS_INC(xs_iflush_count); @@ -2410,25 +2421,8 @@ xfs_iflush( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - iip = ip->i_itemp; - mp = ip->i_mount; + *bpp = NULL; - /* - * We can't flush the inode until it is unpinned, so wait for it if we - * are allowed to block. We know no one new can pin it, because we are - * holding the inode lock shared and you need to hold it exclusively to - * pin the inode. - * - * If we are not allowed to block, force the log out asynchronously so - * that when we come back the inode will be unpinned. If other inodes - * in the same cluster are dirty, they will probably write the inode - * out for us if they occur after the log force completes. - */ - if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { - xfs_iunpin(ip); - xfs_ifunlock(ip); - return EAGAIN; - } xfs_iunpin_wait(ip); /* @@ -2447,20 +2441,20 @@ xfs_iflush( /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk! + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. */ if (XFS_FORCED_SHUTDOWN(mp)) { - if (iip) - iip->ili_fields = 0; - xfs_ifunlock(ip); - return XFS_ERROR(EIO); + error = XFS_ERROR(EIO); + goto abort_out; } /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, - (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); if (error || !bp) { xfs_ifunlock(ip); return error; @@ -2488,23 +2482,20 @@ xfs_iflush( if (error) goto cluster_corrupt_out; - if (flags & SYNC_WAIT) - error = xfs_bwrite(bp); - else - xfs_buf_delwri_queue(bp); - - xfs_buf_relse(bp); - return error; + *bpp = bp; + return 0; corrupt_out: xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: + error = XFS_ERROR(EFSCORRUPTED); +abort_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(ip); - return XFS_ERROR(EFSCORRUPTED); + xfs_iflush_abort(ip, false); + return error; } @@ -2706,27 +2697,6 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } -void -xfs_promote_inode( - struct xfs_inode *ip) -{ - struct xfs_buf *bp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - - bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, - ip->i_imap.im_len, XBF_TRYLOCK); - if (!bp) - return; - - if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_buf_delwri_promote(bp); - wake_up_process(ip->i_mount->m_ddev_targp->bt_task); - } - - xfs_buf_relse(bp); -} - /* * Return a pointer to the extent record at file index idx. 
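The xfs_iflush() rewrite above changes the function's contract: instead of writing or delwri-queueing the buffer itself, it hands the locked buffer back through *bpp and leaves submission to the caller. A minimal userspace sketch of that ownership transfer, with all names invented:

    #include <stdio.h>
    #include <stdlib.h>

    struct buf { int data; };

    /* On success, *bpp is handed to the caller, who must submit and free it. */
    static int flush_to_buffer(struct buf **bpp)
    {
        struct buf *bp = malloc(sizeof(*bp));

        if (!bp)
            return -1;
        bp->data = 42;		/* "flush" in-core state into the buffer */
        *bpp = bp;
        return 0;
    }

    int main(void)
    {
        struct buf *bp = NULL;

        if (flush_to_buffer(&bp) == 0) {
            printf("submitting buffer %d\n", bp->data);	/* caller's I/O */
            free(bp);					/* caller's release */
        }
        return 0;
    }

Pushing submission up to the caller is what lets xfs_inode_item_push() below add the buffer to a delwri list instead of issuing its own I/O.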
*/ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7fee338..1efff36 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -529,11 +529,12 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); -int xfs_iflush(xfs_inode_t *, uint); -void xfs_promote_inode(struct xfs_inode *); +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 05d924e..6cdbf90c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -480,25 +478,16 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * This is called to attempt to lock the inode associated with this - * inode log item, in preparation for the push routine which does the actual - * iflush. Don't sleep on the inode lock or the flush lock. - * - * If the flush lock is already held, indicating that the inode has - * been or is in the process of being flushed, then (ideally) we'd like to - * see if the inode's buffer is still incore, and if so give it a nudge. - * We delay doing so until the pushbuf routine, though, to avoid holding - * the AIL lock across a call to the blackhole which is the buffer cache. - * Also we don't want to sleep in any device strategy routines, which can happen - * if we do the subsequent bawrite in here. - */ STATIC uint -xfs_inode_item_trylock( - struct xfs_log_item *lip) +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; @@ -506,30 +495,50 @@ xfs_inode_item_trylock( if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ + if (xfs_ipincount(ip) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_iflock_nowait(ip)) { - /* - * inode has already been flushed to the backing buffer, - * leave it locked in shared mode, pushbuf routine will - * unlock it. - */ - return XFS_ITEM_PUSHBUF; + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - /* Stale items should force out the iclog */ + /* + * Stale inode items should force out the iclog. 
+ */ if (ip->i_flags & XFS_ISTALE) { xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); return XFS_ITEM_PINNED; } -#ifdef DEBUG - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ASSERT(iip->ili_fields != 0); - ASSERT(iip->ili_logged == 0); - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); } -#endif - return XFS_ITEM_SUCCESS; + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; } /* @@ -614,86 +623,6 @@ xfs_inode_item_committed( } /* - * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK - * failed to get the inode flush lock but did get the inode locked SHARED. - * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll promote it and that will - * allow the caller to write the buffer by triggering the xfsbufd to run. - */ -STATIC bool -xfs_inode_item_pushbuf( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp; - bool ret = true; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - - /* - * If a flush is not in progress anymore, chances are that the - * inode was taken off the AIL. So, just get out. - */ - if (!xfs_isiflocked(ip) || - !(lip->li_flags & XFS_LI_IN_AIL)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return true; - } - - bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XBF_TRYLOCK); - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!bp) - return true; - if (XFS_BUF_ISDELAYWRITE(bp)) - xfs_buf_delwri_promote(bp); - if (xfs_buf_ispinned(bp)) - ret = false; - xfs_buf_relse(bp); - return ret; -} - -/* - * This is called to asynchronously write the inode associated with this - * inode log item out to disk. The inode will already have been locked by - * a successful call to xfs_inode_item_trylock(). - */ -STATIC void -xfs_inode_item_push( - struct xfs_log_item *lip) -{ - struct xfs_inode_log_item *iip = INODE_ITEM(lip); - struct xfs_inode *ip = iip->ili_inode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); - ASSERT(xfs_isiflocked(ip)); - - /* - * Since we were able to lock the inode's flush lock and - * we found it on the AIL, the inode must be dirty. This - * is because the inode is removed from the AIL while still - * holding the flush lock in xfs_iflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the inode. - */ - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0); - - /* - * Push the inode to it's backing buffer. This will not remove the - * inode from the AIL - a further push will be required to trigger a - * buffer push. However, this allows all the dirty inodes to be pushed - * to the buffer before it is pushed to disk. The buffer IO completion - * will pull the inode from the AIL, mark it clean and unlock the flush - * lock. - */ - (void) xfs_iflush(ip, SYNC_TRYLOCK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); -} - -/* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. 
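The new xfs_inode_item_push() above reads the pin count twice: once as an unlocked hint, and again after taking the ilock, when the value is stable. That check/trylock/re-check shape is generic; here is a compact sketch with invented names and pthread primitives standing in for the XFS locks:

    #include <pthread.h>
    #include <stdio.h>

    enum push_status { ITEM_SUCCESS, ITEM_PINNED, ITEM_LOCKED, ITEM_FLUSHING };

    struct item {
        pthread_mutex_t lock;
        int pincount;
        int flushing;
    };

    static enum push_status push_item(struct item *it)
    {
        enum push_status rval = ITEM_SUCCESS;

        if (it->pincount > 0)			/* unlocked hint */
            return ITEM_PINNED;
        if (pthread_mutex_trylock(&it->lock) != 0)
            return ITEM_LOCKED;
        if (it->pincount > 0)			/* re-check, now stable */
            rval = ITEM_PINNED;
        else if (it->flushing)			/* someone else owns the flush */
            rval = ITEM_FLUSHING;
        pthread_mutex_unlock(&it->lock);
        return rval;
    }

    int main(void)
    {
        struct item it = { .lock = PTHREAD_MUTEX_INITIALIZER };

        printf("push -> %d\n", push_item(&it));	/* 0: ITEM_SUCCESS */
        return 0;
    }

The XFS_ITEM_FLUSHING return replaces the old XFS_ITEM_PUSHBUF dance: rather than nudging the backing buffer from a second method, the pusher now simply reports that a flush is already in flight.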
*/ @@ -713,11 +642,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, .iop_unpin = xfs_inode_item_unpin, - .iop_trylock = xfs_inode_item_trylock, .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_pushbuf = xfs_inode_item_pushbuf, .iop_committing = xfs_inode_item_committing }; @@ -848,7 +775,8 @@ xfs_iflush_done( ASSERT(i <= need_ail); } /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i); + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); } @@ -869,16 +797,15 @@ xfs_iflush_done( } /* - * This is the inode flushing abort routine. It is called - * from xfs_iflush when the filesystem is shutting down to clean - * up the inode state. - * It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. */ void xfs_iflush_abort( - xfs_inode_t *ip) + xfs_inode_t *ip, + bool stale) { xfs_inode_log_item_t *iip = ip->i_itemp; @@ -888,7 +815,10 @@ xfs_iflush_abort( spin_lock(&ailp->xa_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { /* xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? + SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); } else spin_unlock(&ailp->xa_lock); } @@ -915,7 +845,7 @@ xfs_istale_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 41d61c3..376d4d0 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -165,7 +165,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index b253c0e..90efdaf 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -26,11 +26,6 @@ * high agno_log-agblklog-inopblog bits - 0 */ -typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ - -#define NULLFSINO ((xfs_ino_t)-1) -#define NULLAGINO ((xfs_agino_t)-1) - struct xfs_mount; #define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 91f8ff5..3a05a41 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a849a54..c4f2da0 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -22,9 +22,7 @@ #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" 
-#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 71a4645..aadfce6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -37,7 +35,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" @@ -142,11 +139,7 @@ xfs_iomap_write_direct( int committed; int error; - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) return XFS_ERROR(error); @@ -158,7 +151,7 @@ xfs_iomap_write_direct( if ((offset + count) > XFS_ISIZE(ip)) { error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - goto error_out; + return XFS_ERROR(error); } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -190,7 +183,6 @@ xfs_iomap_write_direct( /* * Allocate and setup the transaction */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), resrtextents, @@ -199,15 +191,16 @@ xfs_iomap_write_direct( /* * Check for running out of space, note: need lock to return */ - if (error) + if (error) { xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_out; error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); @@ -224,42 +217,39 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, imap, &nimaps, &free_list); if (error) - goto error0; + goto out_bmap_cancel; /* * Complete the transaction */ error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto error0; + goto out_bmap_cancel; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) - goto error_out; + goto out_unlock; /* * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = ENOSPC; - goto error_out; + error = XFS_ERROR(ENOSPC); + goto out_unlock; } - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) error = xfs_alert_fsblock_zero(ip, imap); - goto error_out; - } - return 0; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ +out_bmap_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - -error_out: - return XFS_ERROR(error); + goto out_unlock; } /* @@ -422,6 +412,15 @@ retry: return error; } + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. 
+ */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_maxioffset)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_maxioffset); + + ASSERT(last_fsb > offset_fsb); + nimaps = XFS_WRITE_IMAPS; error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, imap, &nimaps, XFS_BMAPI_ENTIRE); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 3011b87..1a25fd8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_acl.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +32,6 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" -#include "xfs_rw.h" #include "xfs_attr.h" #include "xfs_buf_item.h" #include "xfs_utils.h" @@ -700,7 +697,7 @@ xfs_setattr_size( xfs_off_t oldsize, newsize; struct xfs_trans *tp; int error; - uint lock_flags; + uint lock_flags = 0; uint commit_flags = 0; trace_xfs_setattr(ip); @@ -720,10 +717,10 @@ xfs_setattr_size( ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); - lock_flags = XFS_ILOCK_EXCL; - if (!(flags & XFS_ATTR_NOLOCK)) + if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; - xfs_ilock(ip, lock_flags); + xfs_ilock(ip, lock_flags); + } oldsize = inode->i_size; newsize = iattr->ia_size; @@ -746,7 +743,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach(ip, 0); if (error) goto out_unlock; @@ -768,8 +765,6 @@ xfs_setattr_size( if (error) goto out_unlock; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; /* * We are going to log the inode size change in this transaction so diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index acc2bf2..eff577a 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6db1fef..6b965bf 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -35,7 +33,6 @@ #include "xfs_trans_priv.h" #include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_rw.h" #include "xfs_trace.h" kmem_zone_t *xfs_log_ticket_zone; @@ -916,27 +913,42 @@ xfs_log_need_covered(xfs_mount_t *mp) * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn( +xlog_assign_tail_lsn_locked( struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; struct log *log = mp->m_log; + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn; + + assert_spin_locked(&mp->m_ail->xa_lock); /* * To make sure we always have a valid LSN for the log tail we keep * track of the last LSN which was committed in log->l_last_sync_lsn, - * and use that when the AIL was empty and xfs_ail_min_lsn returns 0. - * - * If the AIL has been emptied we also need to wake any process - * waiting for this condition. + * and use that when the AIL was empty. 
*/ - tail_lsn = xfs_ail_min_lsn(mp->m_ail); - if (!tail_lsn) + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else tail_lsn = atomic64_read(&log->l_last_sync_lsn); atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; } +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} + /* * Return the space in the log between the tail and the head. The head * is passed in the cycle/bytes formal parms. In the special case where @@ -1172,7 +1184,7 @@ xlog_alloc_log(xfs_mount_t *mp, xlog_get_iclog_buffer_size(mp, log); error = ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, 0, log->l_iclog_size, 0); + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; @@ -1182,9 +1194,6 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); - /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ - ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); - iclogp = &log->l_iclog; /* * The amount of memory to allocate for the iclog structure is @@ -1204,7 +1213,7 @@ xlog_alloc_log(xfs_mount_t *mp, prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - log->l_iclog_size, 0); + BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_iclog; @@ -1224,7 +1233,7 @@ xlog_alloc_log(xfs_mount_t *mp, head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); @@ -1475,7 +1484,7 @@ xlog_sync(xlog_t *log, } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_COUNT(bp, count); + bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); XFS_BUF_ASYNC(bp); @@ -1573,7 +1582,7 @@ xlog_dealloc_log(xlog_t *log) * always need to ensure that the extra buffer does not point to memory * owned by another log buffer before we free it. 
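The split of xlog_assign_tail_lsn() above into a _locked variant plus a locking wrapper follows a common kernel convention: the _locked function asserts that the caller holds the lock (here the AIL's xa_lock) and does the work, while the wrapper exists for callers that do not already hold it. A small sketch of the convention with pthread locks and invented names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;
    static long tail_lsn;

    /* Caller must hold ail_lock, like xlog_assign_tail_lsn_locked(). */
    static long assign_tail_lsn_locked(void)
    {
        tail_lsn += 1;		/* stand-in for the min-LSN computation */
        return tail_lsn;
    }

    /* Wrapper for callers that do not hold the lock themselves. */
    static long assign_tail_lsn(void)
    {
        long lsn;

        pthread_mutex_lock(&ail_lock);
        lsn = assign_tail_lsn_locked();
        pthread_mutex_unlock(&ail_lock);
        return lsn;
    }

    int main(void)
    {
        printf("lsn=%ld\n", assign_tail_lsn());
        return 0;
    }

The payoff in the patch is that AIL-internal callers, which already hold xa_lock, can update the tail LSN without dropping and retaking the lock.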
*/ - xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; @@ -2932,6 +2941,7 @@ xfs_log_force( { int error; + trace_xfs_log_force(mp, 0); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3080,6 +3090,7 @@ xfs_log_force_lsn( { int error; + trace_xfs_log_force(mp, lsn); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2c622be..748d312 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -152,6 +152,7 @@ int xfs_log_mount(struct xfs_mount *mp, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, struct xlog_in_core *iclog, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index d4fadbe..7d6197c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log_priv.h" @@ -29,61 +27,10 @@ #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_discard.h" /* - * Perform initial CIL structure initialisation. - */ -int -xlog_cil_init( - struct log *log) -{ - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; - - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); - if (!cil) - return ENOMEM; - - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return ENOMEM; - } - - INIT_LIST_HEAD(&cil->xc_cil); - INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); - init_rwsem(&cil->xc_ctx_lock); - init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - - cil->xc_log = log; - log->l_cilp = cil; - return 0; -} - -void -xlog_cil_destroy( - struct log *log) -{ - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); - } - - ASSERT(list_empty(&log->l_cilp->xc_cil)); - kmem_free(log->l_cilp); -} - -/* * Allocate a new ticket. Failing to get a new ticket makes it really hard to * recover, so we don't allow failure here. 
Also, we allocate in a context that * we don't want to be issuing transactions from, so we need to tell the @@ -390,8 +337,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); - xfs_alloc_busy_sort(&ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); spin_lock(&ctx->cil->xc_cil_lock); @@ -404,7 +351,7 @@ xlog_cil_committed( ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); xfs_discard_extents(mp, &ctx->busy_extents); - xfs_alloc_busy_clear(mp, &ctx->busy_extents, false); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); } kmem_free(ctx); @@ -426,8 +373,7 @@ xlog_cil_committed( */ STATIC int xlog_cil_push( - struct log *log, - xfs_lsn_t push_seq) + struct log *log) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -443,39 +389,36 @@ xlog_cil_push( struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; if (!cil) return 0; - ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* - * Lock out transaction commit, but don't block for background pushes - * unless we are well over the CIL space limit. See the definition of - * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic - * used here. - */ - if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq && - cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) - goto out_free_ticket; - down_write(&cil->xc_ctx_lock); - } + down_write(&cil->xc_ctx_lock); ctx = cil->xc_ctx; - /* check if we've anything to push */ - if (list_empty(&cil->xc_cil)) - goto out_skip; + spin_lock(&cil->xc_cil_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); - /* check for spurious background flush */ - if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_cil_lock); goto out_skip; + } + spin_unlock(&cil->xc_cil_lock); + /* check for a previously pushed seqeunce */ - if (push_seq && push_seq < cil->xc_ctx->sequence) + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -629,7 +572,6 @@ restart: out_skip: up_write(&cil->xc_ctx_lock); -out_free_ticket: xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); return 0; @@ -641,6 +583,82 @@ out_abort: return XFS_ERROR(EIO); } +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. 
+ */ +static void +xlog_cil_push_background( + struct log *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. + */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_cil_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_cil_lock); + +} + +static void +xlog_cil_push_foreground( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_cil_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_cil_lock); + return; + } + + cil->xc_push_seq = push_seq; + spin_unlock(&cil->xc_cil_lock); + + /* do the push now */ + xlog_cil_push(log); +} + /* * Commit a transaction with the given vector to the Committed Item List. * @@ -667,7 +685,6 @@ xfs_log_commit_cil( { struct log *log = mp->m_log; int log_flags = 0; - int push = 0; struct xfs_log_vec *log_vector; if (flags & XFS_TRANS_RELEASE_LOG_RES) @@ -719,21 +736,9 @@ xfs_log_commit_cil( */ xfs_trans_free_items(tp, *commit_lsn, 0); - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; + xlog_cil_push_background(log); up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); return 0; } @@ -746,9 +751,6 @@ xfs_log_commit_cil( * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. - * - * XXX: Initially, just push the CIL unconditionally and return whatever - * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t xlog_cil_force_lsn( @@ -766,8 +768,7 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - if (sequence == cil->xc_current_sequence) - xlog_cil_push(log, sequence); + xlog_cil_push_foreground(log, sequence); /* * See if we can find a previous sequence still committing. @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt( return false; return true; } + +/* + * Perform initial CIL structure initialisation. 
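xlog_cil_push_background() above turns the old inline push check into a deferred one: when the CIL crosses its space limit, the current sequence is recorded in xc_push_seq and the push is queued to m_cil_workqueue rather than run in the committing task. The threshold-and-dedupe logic reduces to a few lines; here is a userspace sketch with simplified, invented types:

    #include <stdbool.h>
    #include <stdio.h>

    #define CIL_SPACE_LIMIT 1024	/* stand-in for XLOG_CIL_SPACE_LIMIT(log) */

    struct cil {
        long space_used;
        long push_seq;
        long current_sequence;
    };

    static bool push_background(struct cil *cil)
    {
        if (cil->space_used < CIL_SPACE_LIMIT)
            return false;			/* not enough cached yet */
        if (cil->push_seq >= cil->current_sequence)
            return false;			/* this sequence already queued */
        cil->push_seq = cil->current_sequence;	/* queue_work() would go here */
        return true;
    }

    int main(void)
    {
        struct cil cil = { .space_used = 2048, .current_sequence = 1 };
        bool first = push_background(&cil);
        bool again = push_background(&cil);

        printf("queued: %d, queued again: %d\n", first, again);
        return 0;
    }

Recording xc_push_seq under xc_cil_lock is what keeps a sequence from being queued twice, while still letting a foreground force via xlog_cil_push_foreground() take over a pending background push.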
+ */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 2152900..735ff1e 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -417,6 +417,8 @@ struct xfs_cil { struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; + xfs_lsn_t xc_push_seq; }; /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8ecad5b..ca38690 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -40,7 +40,6 @@ #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_quota.h" -#include "xfs_rw.h" #include "xfs_utils.h" #include "xfs_trace.h" @@ -120,7 +119,7 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); if (bp) xfs_buf_unlock(bp); return bp; @@ -146,7 +145,7 @@ xlog_align( { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(offset + nbblks <= bp->b_length); return bp->b_addr + BBTOB(offset); } @@ -174,11 +173,12 @@ xlog_bread_noalign( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); @@ -218,7 +218,7 @@ xlog_bread_offset( xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; - int orig_len = bp->b_buffer_length; + int orig_len = BBTOB(bp->b_length); int error, error2; error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); @@ -259,13 +259,14 @@ xlog_bwrite( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; error = xfs_bwrite(bp); if (error) @@ -440,6 +441,8 @@ xlog_find_verify_cycle( * a log sector, or we're out of luck. 
*/ bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) @@ -1225,6 +1228,8 @@ xlog_write_log_records( * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) @@ -1772,7 +1777,7 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1814,7 +1819,8 @@ xlog_recover_do_inode_buffer( ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the @@ -1873,8 +1879,8 @@ xlog_recover_do_reg_buffer( ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -2103,6 +2109,7 @@ xlog_recover_do_dquot_buffer( STATIC int xlog_recover_buffer_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; @@ -2123,9 +2130,9 @@ xlog_recover_buffer_pass2( trace_xfs_log_recover_buf_recover(log, buf_f); - buf_flags = XBF_LOCK; - if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) - buf_flags |= XBF_MAPPED; + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); @@ -2166,14 +2173,14 @@ xlog_recover_buffer_pass2( */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); } xfs_buf_relse(bp); @@ -2183,6 +2190,7 @@ xlog_recover_buffer_pass2( STATIC int xlog_recover_inode_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t *item) { xfs_inode_log_format_t *in_f; @@ -2220,8 +2228,7 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XBF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); if (!bp) { error = ENOMEM; goto error; @@ -2436,7 +2443,7 @@ xlog_recover_inode_pass2( write_inode_buffer: ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); error: if (need_free) @@ -2477,6 +2484,7 @@ xlog_recover_quotaoff_pass1( STATIC int xlog_recover_dquot_pass2( xlog_t *log, + struct list_head *buffer_list, xlog_recover_item_t 
*item) { xfs_mount_t *mp = log->l_mp; @@ -2530,14 +2538,11 @@ xlog_recover_dquot_pass2( return XFS_ERROR(EIO); ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2558,7 +2563,7 @@ xlog_recover_dquot_pass2( ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); return (0); @@ -2642,7 +2647,8 @@ xlog_recover_efd_pass2( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2712,21 +2718,22 @@ STATIC int xlog_recover_commit_pass2( struct log *log, struct xlog_recover *trans, + struct list_head *buffer_list, xlog_recover_item_t *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, item); + return xlog_recover_buffer_pass2(log, buffer_list, item); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, item); + return xlog_recover_inode_pass2(log, buffer_list, item); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, item); + return xlog_recover_dquot_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2750,8 +2757,9 @@ xlog_recover_commit_trans( struct xlog_recover *trans, int pass) { - int error = 0; + int error = 0, error2; xlog_recover_item_t *item; + LIST_HEAD (buffer_list); hlist_del(&trans->r_list); @@ -2760,16 +2768,27 @@ xlog_recover_commit_trans( return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { - if (pass == XLOG_RECOVER_PASS1) + switch (pass) { + case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); - else - error = xlog_recover_commit_pass2(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + error = xlog_recover_commit_pass2(log, trans, + &buffer_list, item); + break; + default: + ASSERT(0); + } + if (error) - return error; + goto out; } xlog_recover_free_trans(trans); - return 0; + +out: + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; } STATIC int @@ -3079,7 +3098,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, 0); if (error) goto fail_iput; @@ -3639,11 +3658,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. 
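The recovery changes above thread a caller-owned buffer_list through the pass-2 handlers: each handler queues its buffer with xfs_buf_delwri_queue() instead of writing it, and xlog_recover_commit_trans() submits the whole batch once at the end, keeping the first error. A tiny sketch of that collect-then-submit shape, with an array standing in for the kernel's list_head plumbing and all names invented:

    #include <stdio.h>

    #define NBUFS 3

    static int submit_all(const int *list, int n)
    {
        int i;

        for (i = 0; i < n; i++)
            printf("writing buffer %d\n", list[i]);	/* one I/O pass */
        return 0;		/* first submission error, if any */
    }

    int main(void)
    {
        int buffer_list[NBUFS];
        int n, error = 0, error2;

        for (n = 0; n < NBUFS; n++)
            buffer_list[n] = n;	/* handlers queue, never write */

        error2 = submit_all(buffer_list, n);	/* always submit, as at out: */
        return error ? error : error2;
    }

Batching the writes means recovery no longer depends on the old xfsbufd flush (the removed xfs_flush_buftarg() call) to get replayed buffers to disk.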
@@ -3670,7 +3686,6 @@ xlog_do_recover( bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); xfsbdstrat(log->l_mp, bp); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index bd672de..331cd9f 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 1ffead4b..536021f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -22,6 +22,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir2.h" @@ -37,7 +38,6 @@ #include "xfs_rtalloc.h" #include "xfs_bmap.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_utils.h" @@ -683,8 +683,8 @@ xfs_readsb(xfs_mount_t *mp, int flags) sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); reread: - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, - XFS_SB_DADDR, sector_size, 0); + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0); if (!bp) { if (loud) xfs_warn(mp, "SB buffer read failed"); @@ -1032,9 +1032,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "filesystem size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, + bp = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); + XFS_FSS_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "last sector read failed"); return EIO; @@ -1047,9 +1047,9 @@ xfs_check_sizes(xfs_mount_t *mp) xfs_warn(mp, "log size mismatch detected"); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, + bp = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "log device read failed"); return EIO; @@ -1288,7 +1288,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_free_perag; + goto out_fail_wait; } /* @@ -1315,7 +1315,7 @@ xfs_mountfs( !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); if (error) - goto out_free_perag; + goto out_fail_wait; } /* @@ -1439,6 +1439,10 @@ xfs_mountfs( IRELE(rip); out_log_dealloc: xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); out_remove_uuid: @@ -1475,15 +1479,15 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* - * Do a delwri reclaim pass first so that as many dirty inodes are - * queued up for IO as possible. Then flush the buffers before making - * a synchronous path to catch all the remaining inodes are reclaimed. - * This makes the reclaim process as quick as possible by avoiding - * synchronous writeout and blocking on inodes already in the delwri - * state as much as possible. + * Flush all pending changes from the AIL. + */ + xfs_ail_push_all_sync(mp->m_ail); + + /* + * And reclaim all inodes. At this point there should be no dirty + * inode, and none should be pinned or locked, but use synchronous + * reclaim just to be sure. 
*/ - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 1); xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); @@ -1519,15 +1523,12 @@ xfs_unmountfs( if (error) xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); - xfs_unmountfs_writesb(mp); /* - * Make sure all buffers have been flushed and completed before - * unmounting the log. + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. */ - error = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (error) - xfs_warn(mp, "%d busy buffers during unmount.", error); + xfs_ail_push_all_sync(mp->m_ail); xfs_wait_buftarg(mp->m_ddev_targp); xfs_log_unmount_write(mp); @@ -1588,36 +1589,6 @@ xfs_log_sbcount(xfs_mount_t *mp) return error; } -int -xfs_unmountfs_writesb(xfs_mount_t *mp) -{ - xfs_buf_t *sbp; - int error = 0; - - /* - * skip superblock write if fs is read-only, or - * if we are doing a forced umount. - */ - if (!((mp->m_flags & XFS_MOUNT_RDONLY) || - XFS_FORCED_SHUTDOWN(mp))) { - - sbp = xfs_getsb(mp, 0); - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - xfs_buf_delwri_dequeue(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - ASSERT(sbp->b_target == mp->m_ddev_targp); - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - if (error) - xfs_buf_ioerror_alert(sbp, __func__); - xfs_buf_relse(sbp); - } - return error; -} - /* * xfs_mod_sb() can be used to copy arbitrary changes to the * in-core superblock into the superblock buffer to be logged. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9eba738..8b89c5a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -214,6 +214,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; } xfs_mount_t; /* @@ -378,7 +379,6 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 55c6afe..249db19 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -65,7 +64,8 @@ STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, int type, - int (*execute)(struct xfs_dquot *dqp)) + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); @@ -97,7 +97,7 @@ restart: next_index = be32_to_cpu(dqp->q_core.d_id) + 1; - error = execute(batch[i]); + error = execute(batch[i], data); if (error == EAGAIN) { skipped++; continue; @@ -129,7 +129,8 @@ restart: */ STATIC int xfs_qm_dqpurge( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp, + void *data) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -153,21 +154,7 @@ xfs_qm_dqpurge( dqp->dq_flags |= XFS_DQ_FREEING; - /* - * If we're turning off quotas, we have to make sure that, for - * example, we don't delete quota disk blocks while dquots are - * in the process of getting written to those disk blocks. 
- * This dquot might well be on AIL, and we can't leave it there - * if we're turning off quotas. Basically, we need this flush - * lock, and are willing to block on it. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* - * Block on the flush lock after nudging dquot buffer, - * if it is incore. - */ - xfs_dqflock_pushbuf_wait(dqp); - } + xfs_dqflock(dqp); /* * If we are turning this type of quotas off, we don't care @@ -175,16 +162,21 @@ xfs_qm_dqpurge( * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + struct xfs_buf *bp = NULL; + int error; /* * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ - error = xfs_qm_dqflush(dqp, SYNC_WAIT); - if (error) + error = xfs_qm_dqflush(dqp, &bp); + if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); + } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } xfs_dqflock(dqp); } @@ -226,11 +218,11 @@ xfs_qm_dqpurge_all( uint flags) { if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -483,6 +475,23 @@ done: xfs_dqunlock(udq); } +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (ip->i_ino == mp->m_sb.sb_uquotino || + ip->i_ino == mp->m_sb.sb_gquotino) + return false; + return true; +} /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON @@ -500,11 +509,7 @@ xfs_qm_dqattach_locked( uint nquotas = 0; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || - !XFS_NOT_DQATTACHED(mp, ip) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (!xfs_qm_need_dqattach(ip)) return 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -575,6 +580,9 @@ xfs_qm_dqattach( { int error; + if (!xfs_qm_need_dqattach(ip)) + return 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -855,15 +863,16 @@ xfs_qm_reset_dqcounts( STATIC int xfs_qm_dqiter_bufs( - xfs_mount_t *mp, - xfs_dqid_t firstid, - xfs_fsblock_t bno, - xfs_filblks_t blkcnt, - uint flags) + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) { - xfs_buf_t *bp; - int error; - int type; + struct xfs_buf *bp; + int error; + int type; ASSERT(blkcnt > 0); type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : @@ -887,7 +896,7 @@ xfs_qm_dqiter_bufs( break; xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); /* * goto the next block. 
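One detail of the quota changes above is worth calling out: xfs_qm_dquot_walk() grows an opaque "void *data" cookie that is passed through to the callback, which is what later lets xfs_qm_flush_one() receive the delwri buffer list while xfs_qm_dqpurge() simply passes NULL. The shape of that refactoring, reduced to a self-contained example (the types and names here are illustrative, not the kernel's):

#include <stdio.h>

struct dquot {
	int id;
};

/* The callback now takes a caller-supplied cookie. */
typedef int (*dquot_fn)(struct dquot *dqp, void *data);

static int dquot_walk(struct dquot *dqs, int ndqs, dquot_fn execute, void *data)
{
	int i, error;

	for (i = 0; i < ndqs; i++) {
		error = execute(&dqs[i], data);
		if (error)
			return error;
	}
	return 0;
}

static int purge_one(struct dquot *dqp, void *data)
{
	(void)data;		/* purging needs no shared state: NULL cookie */
	printf("purging dquot %d\n", dqp->id);
	return 0;
}

static int flush_one(struct dquot *dqp, void *data)
{
	int *nqueued = data;	/* in the kernel: the delwri buffer list */

	printf("queueing dquot %d for writeback\n", dqp->id);
	(*nqueued)++;
	return 0;
}

int main(void)
{
	struct dquot dqs[] = { { 1 }, { 2 } };
	int nqueued = 0;

	dquot_walk(dqs, 2, purge_one, NULL);
	dquot_walk(dqs, 2, flush_one, &nqueued);
	return nqueued == 2 ? 0 : 1;
}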
@@ -895,6 +904,7 @@ xfs_qm_dqiter_bufs( bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } + return error; } @@ -904,11 +914,12 @@ xfs_qm_dqiter_bufs( */ STATIC int xfs_qm_dqiterate( - xfs_mount_t *mp, - xfs_inode_t *qip, - uint flags) + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) { - xfs_bmbt_irec_t *map; + struct xfs_bmbt_irec *map; int i, nmaps; /* number of map entries */ int error; /* return value */ xfs_fileoff_t lblkno; @@ -975,21 +986,17 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - if ((error = xfs_qm_dqiter_bufs(mp, - firstid, - map[i].br_startblock, - map[i].br_blockcount, - flags))) { - break; - } + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; } - - if (error) - break; } while (nmaps > 0); +out: kmem_free(map); - return error; } @@ -1182,8 +1189,11 @@ error0: STATIC int xfs_qm_flush_one( - struct xfs_dquot *dqp) + struct xfs_dquot *dqp, + void *data) { + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; int error = 0; xfs_dqlock(dqp); @@ -1192,11 +1202,13 @@ xfs_qm_flush_one( if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - if (!xfs_dqflock_nowait(dqp)) - xfs_dqflock_pushbuf_wait(dqp); - - error = xfs_qm_dqflush(dqp, 0); + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); return error; @@ -1215,6 +1227,7 @@ xfs_qm_quotacheck( size_t structsz; xfs_inode_t *uip, *gip; uint flags; + LIST_HEAD (buffer_list); count = INT_MAX; structsz = 1; @@ -1233,7 +1246,8 @@ xfs_qm_quotacheck( */ uip = mp->m_quotainfo->qi_uquotaip; if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; @@ -1242,7 +1256,8 @@ xfs_qm_quotacheck( gip = mp->m_quotainfo->qi_gquotaip; if (gip) { error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA, + &buffer_list); if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; @@ -1265,19 +1280,27 @@ xfs_qm_quotacheck( * We've made all the changes that we need to make incore. Flush them * down to disk buffers if everything was updated successfully. */ - if (XFS_IS_UQUOTA_ON(mp)) - error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } if (XFS_IS_GQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { - error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); if (!error) error = error2; } + error2 = xfs_buf_delwri_submit(&buffer_list); + if (!error) + error = error2; + /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the @@ -1291,15 +1314,6 @@ xfs_qm_quotacheck( } /* - * We didn't log anything, because if we crashed, we'll have to - * start the quotacheck from scratch anyway. 
However, we must make - * sure that our dquot changes are secure before we put the - * quotacheck'd stamp on the superblock. So, here we do a synchronous - * flush. - */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. @@ -1308,6 +1322,13 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + if (error) { xfs_warn(mp, "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", @@ -1424,6 +1445,7 @@ xfs_qm_dqfree_one( STATIC void xfs_qm_dqreclaim_one( struct xfs_dquot *dqp, + struct list_head *buffer_list, struct list_head *dispose_list) { struct xfs_mount *mp = dqp->q_mount; @@ -1456,25 +1478,20 @@ xfs_qm_dqreclaim_one( if (!xfs_dqflock_nowait(dqp)) goto out_busy; - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + trace_xfs_dqreclaim_dirty(dqp); - /* - * We flush it delayed write, so don't bother releasing the - * freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); + error = xfs_qm_dqflush(dqp, &bp); if (error) { xfs_warn(mp, "%s: dquot %p flush failed", __func__, dqp); + goto out_busy; } + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); /* * Give the dquot another try on the freelist, as the * flushing will take some time. @@ -1518,8 +1535,10 @@ xfs_qm_shake( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (buffer_list); LIST_HEAD (dispose_list); struct xfs_dquot *dqp; + int error; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -1532,15 +1551,20 @@ xfs_qm_shake( break; dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, q_lru); - xfs_qm_dqreclaim_one(dqp, &dispose_list); + xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); } mutex_unlock(&qi->qi_lru_lock); + error = xfs_buf_delwri_submit(&buffer_list); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + while (!list_empty(&dispose_list)) { dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); list_del_init(&dqp->q_lru); xfs_qm_dqfree_one(dqp); } + out: return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; } diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index e6986b5..6b39115 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c4f396e..858a3b1 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -22,7 +22,6 @@ #include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7e76f53..fed504f 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -17,7 +17,6 @@ */ #include "xfs.h" #include "xfs_sb.h" -#include "xfs_inum.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" diff --git 
a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index e44ef7e..30ff5f4 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca4f315..92d4331 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -20,7 +20,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -34,7 +33,6 @@ #include "xfs_rtalloc.h" #include "xfs_fsops.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_inode_item.h" #include "xfs_trans_space.h" #include "xfs_utils.h" @@ -1872,9 +1870,9 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) return EIO; xfs_buf_relse(bp); @@ -2219,9 +2217,9 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return XFS_ERROR(EFBIG); } - bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp, + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_B(mp, 1), 0); + XFS_FSB_TO_BB(mp, 1), 0); if (!bp) { xfs_warn(mp, "realtime device size check failed"); return EIO; diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c deleted file mode 100644 index 597d044..0000000 --- a/fs/xfs/xfs_rw.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_error.h" -#include "xfs_rw.h" - -/* - * Force a shutdown of the filesystem instantly while keeping - * the filesystem consistent. We don't do an unmount here; just shutdown - * the shop, make sure that absolutely nothing persistent happens to - * this filesystem after this point. - */ -void -xfs_do_force_shutdown( - xfs_mount_t *mp, - int flags, - char *fname, - int lnnum) -{ - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = 0x%p", - __func__, flags, lnnum, fname, __return_address); - } - /* - * No need to duplicate efforts. 
- */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); - } -} - -/* - * This isn't an absolute requirement, but it is - * just a good idea to call xfs_read_buf instead of - * directly doing a read_buf call. For one, we shouldn't - * be doing this disk read if we are in SHUTDOWN state anyway, - * so this stops that from happening. Secondly, this does all - * the error checking stuff and the brelse if appropriate for - * the caller, so the code can be a little leaner. - */ - -int -xfs_read_buf( - struct xfs_mount *mp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) -{ - xfs_buf_t *bp; - int error; - - if (!flags) - flags = XBF_LOCK | XBF_MAPPED; - - bp = xfs_buf_read(target, blkno, len, flags); - if (!bp) - return XFS_ERROR(EIO); - error = bp->b_error; - if (!error && !XFS_FORCED_SHUTDOWN(mp)) { - *bpp = bp; - } else { - *bpp = NULL; - if (error) { - xfs_buf_ioerror_alert(bp, __func__); - } else { - error = XFS_ERROR(EIO); - } - if (bp) { - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - /* - * brelse clears B_ERROR and b_error - */ - xfs_buf_relse(bp); - } - } - return (error); -} - -/* - * helper function to extract extent size hint from inode - */ -xfs_extlen_t -xfs_get_extsz_hint( - struct xfs_inode *ip) -{ - if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) - return ip->i_d.di_extsize; - if (XFS_IS_REALTIME_INODE(ip)) - return ip->i_mount->m_sb.sb_rextsize; - return 0; -} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h deleted file mode 100644 index bbdb9ad..0000000 --- a/fs/xfs/xfs_rw.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_RW_H__ -#define __XFS_RW_H__ - -struct xfs_buf; -struct xfs_inode; -struct xfs_mount; - -/* - * Convert the given file system block to a disk block. - * We have to treat it differently based on whether the - * file is a real time file or not, because the bmap code - * does. - */ -static inline xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (XFS_IS_REALTIME_INODE(ip) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} - -/* - * Prototypes for functions in xfs_rw.c. - */ -extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, - xfs_daddr_t blkno, int len, uint flags, - struct xfs_buf **bpp); -extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); - -#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dab9a5f..0d9de41 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -17,7 +17,6 @@ */ #include "xfs.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -622,7 +621,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); } STATIC void @@ -773,8 +772,14 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_data_iodone_queue; + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; return 0; +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: destroy_workqueue(mp->m_data_workqueue); out: @@ -785,6 +790,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); } @@ -926,7 +932,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); XFS_STATS_DEC(vn_active); @@ -981,18 +987,9 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - xfs_syncd_stop(mp); - - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. 
- */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); xfs_freesb(mp); xfs_icsb_destroy_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1072,7 +1069,7 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); - if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) xfs_qm_statvfs(ip, statp); @@ -1362,31 +1359,32 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_mountfs(mp); + error = xfs_syncd_init(mp); if (error) goto out_filestream_unmount; - error = xfs_syncd_init(mp); + error = xfs_mountfs(mp); if (error) - goto out_unmount; + goto out_syncd_stop; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto out_syncd_stop; + goto out_unmount; } if (is_bad_inode(root)) { error = EINVAL; - goto out_syncd_stop; + goto out_unmount; } sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; - goto out_syncd_stop; + goto out_unmount; } return 0; - + out_syncd_stop: + xfs_syncd_stop(mp); out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: @@ -1403,19 +1401,10 @@ out_destroy_workqueues: out: return -error; - out_syncd_stop: - xfs_syncd_stop(mp); out_unmount: - /* - * Blow away any referenced inode in the filestreams cache. - * This can and will cause log traffic as inodes go inactive - * here. - */ xfs_filestream_unmount(mp); - - xfs_flush_buftarg(mp->m_ddev_targp, 1); - xfs_unmountfs(mp); + xfs_syncd_stop(mp); goto out_free_sb; } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 205ebcb..c9d3409 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" @@ -241,45 +240,6 @@ xfs_sync_inode_data( return error; } -STATIC int -xfs_sync_inode_attr( - struct xfs_inode *ip, - struct xfs_perag *pag, - int flags) -{ - int error = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_inode_clean(ip)) - goto out_unlock; - if (!xfs_iflock_nowait(ip)) { - if (!(flags & SYNC_WAIT)) - goto out_unlock; - xfs_iflock(ip); - } - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - goto out_unlock; - } - - error = xfs_iflush(ip, flags); - - /* - * We don't want to try again on non-blocking flushes that can't run - * again immediately. If an inode really must be written, then that's - * what the SYNC_WAIT flag is for. - */ - if (error == EAGAIN) { - ASSERT(!(flags & SYNC_WAIT)); - error = 0; - } - - out_unlock: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - /* * Write out pagecache data for the whole filesystem. */ @@ -300,19 +260,6 @@ xfs_sync_data( return 0; } -/* - * Write out inode metadata (attributes) for the whole filesystem. - */ -STATIC int -xfs_sync_attr( - struct xfs_mount *mp, - int flags) -{ - ASSERT((flags & ~SYNC_WAIT) == 0); - - return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -350,7 +297,7 @@ xfs_sync_fsdata( * First stage of freeze - no writers will make progress now we are here, * so we flush delwri and delalloc buffers here, then wait for all I/O to * complete. Data is frozen at that point. Metadata is not frozen, - * transactions can still occur here so don't bother flushing the buftarg + * transactions can still occur here so don't bother emptying the AIL * because it'll just get dirty again. 
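The xfs_fs_fill_super() reshuffle above also moves the error labels so that teardown runs strictly in reverse order of setup: syncd is now initialized before the mount, so on a later failure it is stopped only after the filesystem has been unmounted again. The underlying goto-unwind idiom, shown stand-alone with hypothetical init functions:

#include <stdio.h>

static int  syncd_init(void) { puts("syncd init"); return 0; }
static void syncd_stop(void) { puts("syncd stop"); }
static int  mountfs(void)    { puts("mountfs");    return -1; /* simulate failure */ }

static int fill_super(void)
{
	int error;

	error = syncd_init();
	if (error)
		goto out;

	error = mountfs();
	if (error)
		goto out_syncd_stop;	/* undo only what succeeded so far */

	return 0;

	/* Unwind labels appear in reverse order of initialization. */
out_syncd_stop:
	syncd_stop();
out:
	return error;
}

int main(void)
{
	return fill_super() ? 1 : 0;
}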
*/ int @@ -365,47 +312,13 @@ xfs_quiesce_data( /* write superblock and hoover up shutdown errors */ error = xfs_sync_fsdata(mp); - /* make sure all delwri buffers are written out */ - xfs_flush_buftarg(mp->m_ddev_targp, 1); - /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) error2 = xfs_fs_log_dummy(mp); - /* flush data-only devices */ - if (mp->m_rtdev_targp) - xfs_flush_buftarg(mp->m_rtdev_targp, 1); - return error ? error : error2; } -STATIC void -xfs_quiesce_fs( - struct xfs_mount *mp) -{ - int count = 0, pincount; - - xfs_reclaim_inodes(mp, 0); - xfs_flush_buftarg(mp->m_ddev_targp, 0); - - /* - * This loop must run at least twice. The first instance of the loop - * will flush most meta data but that will generate more meta data - * (typically directory updates), which then must be flushed and - * logged before we can write the unmount record. We also do sync - * reclaim of inodes to catch any that the above delwri flush skipped. - */ - do { - xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_sync_attr(mp, SYNC_WAIT); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); -} - /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to @@ -421,8 +334,12 @@ xfs_quiesce_attr( while (atomic_read(&mp->m_active_trans) > 0) delay(100); - /* flush inodes and push all remaining buffers out to disk */ - xfs_quiesce_fs(mp); + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* flush all pending changes from the AIL */ + xfs_ail_push_all_sync(mp->m_ail); /* * Just warn here till VFS can correctly support * xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); + + /* + * At this point we might have modified the superblock again and thus + * added an item to the AIL, thus flush it again. + */ + xfs_ail_push_all_sync(mp->m_ail); } static void @@ -460,16 +382,27 @@ xfs_sync_worker( struct xfs_mount, m_sync_work); int error; - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently dirty */ - xfs_ail_push_all(mp->m_ail); + /* + * We shouldn't write/force the log if we are in the mount/unmount + * process or on a read only filesystem. The workqueue still needs to be + * active in both cases, however, because it is used for inode reclaim + * during these times. Use the s_umount semaphore to provide exclusion + * with unmount. + */ + if (down_read_trylock(&mp->m_super->s_umount)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently + * dirty */ + xfs_ail_push_all(mp->m_ail); + } + up_read(&mp->m_super->s_umount); } /* queue us up again */ @@ -488,14 +421,6 @@ xfs_syncd_queue_reclaim( struct xfs_mount *mp) { - /* - * We can have inodes enter reclaim after we've shut down the syncd - * workqueue during unmount, so don't allow reclaim work to be queued - * during unmount.
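The s_umount trylock added to xfs_sync_worker() above is the interesting bit: the periodic worker keeps running across mount and unmount (it is still needed for inode reclaim) but skips its log work whenever it cannot take the superblock lock shared, because unmount holds that lock exclusive. A user-space analogue of the skip-if-busy guard, using a pthread rwlock in place of the s_umount rwsem:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;

static void sync_worker(void)
{
	/* Non-blocking shared acquisition: skip a cycle if unmount owns it. */
	if (pthread_rwlock_tryrdlock(&s_umount) == 0) {
		puts("covering the log / pushing the AIL");
		pthread_rwlock_unlock(&s_umount);
	} else {
		puts("mount/unmount in progress, skipping log work");
	}
	/* the real worker requeues itself here */
}

int main(void)
{
	sync_worker();				/* lock free: does log work */

	pthread_rwlock_wrlock(&s_umount);	/* simulate unmount in progress */
	sync_worker();				/* trylock fails: cycle skipped */
	pthread_rwlock_unlock(&s_umount);
	return 0;
}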
- */ - if (!(mp->m_super->s_flags & MS_ACTIVE)) - return; - rcu_read_lock(); if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, @@ -564,7 +489,6 @@ xfs_syncd_init( INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); xfs_syncd_queue_sync(mp); - xfs_syncd_queue_reclaim(mp); return 0; } @@ -702,11 +626,8 @@ xfs_reclaim_inode_grab( } /* - * Inodes in different states need to be treated differently, and the return - * value of xfs_iflush is not sufficient to get this right. The following table - * lists the inode states and the reclaim actions necessary for non-blocking - * reclaim: - * + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: * * inode state iflush ret required action * --------------- ---------- --------------- * stale, unpinned 0 reclaim * clean, pinned(*) 0 requeue * stale, pinned EAGAIN requeue - * dirty, delwri ok 0 requeue - * dirty, delwri blocked EAGAIN requeue - * dirty, sync flush 0 reclaim + * dirty, async - requeue + * dirty, sync 0 reclaim * * (*) dgc: I don't think the clean, pinned state is possible but it gets * handled anyway given the order of checks implemented. * - * As can be seen from the table, the return value of xfs_iflush() is not - * sufficient to correctly decide the reclaim action here. The checks in - * xfs_iflush() might look like duplicates, but they are not. - * * Also, because we get the flush lock first, we know that any inode that has * been flushed delwri has had the flush completed by the time we check that - * the inode is clean. The clean inode check needs to be done before flushing - * the inode delwri otherwise we would loop forever requeuing clean inodes as - * we cannot tell apart a successful delwri flush and a clean inode from the - * return value of xfs_iflush(). + * the inode is clean. * - * Note that because the inode is flushed delayed write by background - * writeback, the flush lock may already be held here and waiting on it can - * result in very long latencies. Hence for sync reclaims, where we wait on the - * flush lock, the caller should push out delayed write inodes first before - * trying to reclaim them to minimise the amount of time spent waiting. For - * background reclaim, we just requeue the inode for the next pass. + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background reclaim, we only + * bother to reclaim clean inodes anyway.
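To make the slimmed-down state table above concrete (the matching order-of-actions list continues in the comment just below), the reclaim decision can be encoded as one function. This is an illustrative reduction of the logic, not kernel code:

#include <stdbool.h>
#include <stdio.h>

enum action { RECLAIM, REQUEUE, FLUSH_AND_RECLAIM };

struct inode_state {
	bool bad, shutdown, pinned, stale, dirty;
};

static enum action reclaim_action(const struct inode_state *ip, bool sync_wait)
{
	if (ip->bad)
		return RECLAIM;
	if (ip->shutdown)
		return RECLAIM;		/* after unpin and flush abort */
	if (ip->pinned && !sync_wait)
		return REQUEUE;		/* pinned, async => requeue */
	/* pinned, sync: unpin (wait) and fall through to the checks below */
	if (ip->stale || !ip->dirty)
		return RECLAIM;		/* stale or clean => reclaim */
	/* dirty: with the delwri path gone, only sync reclaim does I/O */
	return sync_wait ? FLUSH_AND_RECLAIM : REQUEUE;
}

int main(void)
{
	struct inode_state dirty = { .dirty = true };

	printf("dirty, async => %s\n",
	       reclaim_action(&dirty, false) == REQUEUE ? "requeue" : "?");
	printf("dirty, sync  => %s\n",
	       reclaim_action(&dirty, true) == FLUSH_AND_RECLAIM ?
			"flush, wait and reclaim" : "?");
	return 0;
}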
* * Hence the order of actions after gaining the locks should be: * bad => reclaim * shutdown => unpin and reclaim - * pinned, delwri => requeue + * pinned, async => requeue * pinned, sync => unpin * stale => reclaim * clean => reclaim - * dirty, delwri => flush and requeue + * dirty, async => requeue * dirty, sync => flush, wait and reclaim */ STATIC int @@ -757,7 +670,8 @@ xfs_reclaim_inode( struct xfs_perag *pag, int sync_mode) { - int error; + struct xfs_buf *bp = NULL; + int error; restart: error = 0; @@ -765,17 +679,6 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; - - /* - * If we only have a single dirty inode in a cluster there is - * a fair chance that the AIL push may have pushed it into - * the buffer, but xfsbufd won't touch it until 30 seconds - * from now, and thus we will lock up here. - * - * Promote the inode buffer to the front of the delwri list - * and wake up xfsbufd now. - */ - xfs_promote_inode(ip); xfs_iflock(ip); } @@ -783,13 +686,12 @@ restart: goto reclaim; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); goto reclaim; } if (xfs_ipincount(ip)) { - if (!(sync_mode & SYNC_WAIT)) { - xfs_ifunlock(ip); - goto out; - } + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; xfs_iunpin_wait(ip); } if (xfs_iflags_test(ip, XFS_ISTALE)) @@ -798,60 +700,42 @@ restart: goto reclaim; /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* * Now we have an inode that needs flushing. * - * We do a nonblocking flush here even if we are doing a SYNC_WAIT - * reclaim as we can deadlock with inode cluster removal. + * Note that xfs_iflush will never block on the inode buffer lock, as * xfs_ifree_cluster() can lock the inode buffer before it locks the - * ip->i_lock, and we are doing the exact opposite here. As a result, - * doing a blocking xfs_itobp() to get the cluster buffer will result + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer would result * in an ABBA deadlock with xfs_ifree_cluster(). * * As xfs_ifree_cluster() must gather all inodes that are active in the * cache to mark them stale, if we hit this case we don't actually want * to do IO here - we want the inode marked stale so we can simply - * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, - * just unlock the inode, back off and try again. Hopefully the next - * pass through will see the stale flag set on the inode. + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. */ - error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); - if (sync_mode & SYNC_WAIT) { - if (error == EAGAIN) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* backoff longer than in xfs_ifree_cluster */ - delay(2); - goto restart; - } - xfs_iflock(ip); - goto reclaim; + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; } - /* - * When we have to flush an inode but don't have SYNC_WAIT set, we - * flush the inode out using a delwri buffer and wait for the next - * call into reclaim to find it in a clean state instead of waiting for - * it now.
We also don't return errors here - if the error is transient - * then the next reclaim pass will flush the inode, and if the error - * is permanent then the next sync reclaim will reclaim the inode and - * pass on the error. - */ - if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_warn(ip->i_mount, - "inode 0x%llx background reclaim flush failed with %d", - (long long)ip->i_ino, error); + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); } -out: - xfs_iflags_clear(ip, XFS_IRECLAIM); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * We could return EAGAIN here to make reclaim rescan the inode tree in - * a short while. However, this just burns CPU time scanning the tree - * waiting for IO to complete and xfssyncd never goes back to the idle - * state. Instead, return 0 to let the next scheduled background reclaim - * attempt to reclaim the inode again. - */ - return 0; + xfs_iflock(ip); reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -884,8 +768,21 @@ reclaim: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_inode_free(ip); - return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and xfssyncd never goes back to the idle + * state. Instead, return 0 to let the next scheduled background reclaim + * attempt to reclaim the inode again. + */ + return 0; } /* diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9010ce8..624bedd 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 06838c4..7cf9d35 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -281,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(int, nblks) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) @@ -291,18 +291,18 @@ DECLARE_EVENT_CLASS(xfs_buf_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " "lock %d flags %s caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->nblks, __entry->hold, __entry->pincount, __entry->lockval, @@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); -DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); @@ -362,7 +362,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - 
__entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -406,7 +406,7 @@ TRACE_EVENT(xfs_buf_ioerror, TP_fast_assign( __entry->dev = bp->b_target->bt_dev; __entry->bno = bp->b_bn; - __entry->buffer_length = bp->b_buffer_length; + __entry->buffer_length = BBTOB(bp->b_length); __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; @@ -450,7 +450,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); __entry->buf_bno = bip->bli_buf->b_bn; - __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); @@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); @@ -876,15 +874,30 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) ) +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + #define DEFINE_LOG_ITEM_EVENT(name) \ DEFINE_EVENT(xfs_log_item_class, name, \ TP_PROTO(struct xfs_log_item *lip), \ TP_ARGS(lip)) DEFINE_LOG_ITEM_EVENT(xfs_ail_push); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf); -DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DECLARE_EVENT_CLASS(xfs_file_class, @@ -1145,7 +1158,7 @@ TRACE_EVENT(xfs_bunmap, ); -DECLARE_EVENT_CLASS(xfs_busy_class, +DECLARE_EVENT_CLASS(xfs_extent_busy_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, agbno, len), @@ -1168,17 +1181,17 @@ DECLARE_EVENT_CLASS(xfs_busy_class, __entry->len) ); #define DEFINE_BUSY_EVENT(name) \ -DEFINE_EVENT(xfs_busy_class, name, \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ xfs_agblock_t agbno, xfs_extlen_t len), \ TP_ARGS(mp, agno, agbno, len)) -DEFINE_BUSY_EVENT(xfs_alloc_busy); -DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); -DEFINE_BUSY_EVENT(xfs_alloc_busy_force); -DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); -DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); -TRACE_EVENT(xfs_alloc_busy_trim, 
+TRACE_EVENT(xfs_extent_busy_trim, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 103b00c..cdf896f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -19,9 +19,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -36,6 +34,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_trans_priv.h" @@ -608,8 +607,8 @@ STATIC void xfs_trans_free( struct xfs_trans *tp) { - xfs_alloc_busy_sort(&tp->t_busy); - xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false); + xfs_extent_busy_sort(&tp->t_busy); + xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f611870..7ab99e1 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -345,11 +345,9 @@ struct xfs_item_ops { void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); - uint (*iop_trylock)(xfs_log_item_t *); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_push)(xfs_log_item_t *); - bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); }; @@ -357,20 +355,18 @@ struct xfs_item_ops { #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) #define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) -#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) #define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) /* - * Return values for the IOP_TRYLOCK() routines. + * Return values for the IOP_PUSH() routines. */ -#define XFS_ITEM_SUCCESS 0 -#define XFS_ITEM_PINNED 1 -#define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_PUSHBUF 3 +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 /* * This is the type of function which can be given to xfs_trans_callback() diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1dead07..9c51448 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -20,7 +20,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -79,7 +78,7 @@ xfs_ail_check( * Return a pointer to the first item in the AIL. If the AIL is empty, then * return NULL. 
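The xfs_trans.h hunk above is the interface change that drives the rest of this series: the three item methods iop_trylock/iop_push/iop_pushbuf collapse into a single iop_push that takes the AIL's buffer list and reports one of four outcomes, with XFS_ITEM_FLUSHING replacing XFS_ITEM_PUSHBUF. A sketch of the new contract and of how a pusher might dispatch on it (the result strings paraphrase the xfsaild_push() handling that follows):

#include <stdio.h>

/* Return values of the consolidated ->iop_push(), per the header change. */
#define XFS_ITEM_SUCCESS	0
#define XFS_ITEM_PINNED		1
#define XFS_ITEM_LOCKED		2
#define XFS_ITEM_FLUSHING	3

struct xfs_log_item;
struct list_head;

/* One method now covers what trylock/push/pushbuf used to do. */
typedef unsigned int (*iop_push_fn)(struct xfs_log_item *lip,
				    struct list_head *buffer_list);

static const char *push_disposition(unsigned int rc)
{
	switch (rc) {
	case XFS_ITEM_SUCCESS:	return "pushed; advance last_pushed_lsn";
	case XFS_ITEM_PINNED:	return "pinned; count as stuck, log force later";
	case XFS_ITEM_LOCKED:	return "locked elsewhere; count as stuck";
	case XFS_ITEM_FLUSHING:	return "already flushing; retry soon";
	default:		return "invalid return value";
	}
}

int main(void)
{
	unsigned int rc;

	for (rc = 0; rc <= 3; rc++)
		printf("%u: %s\n", rc, push_disposition(rc));
	return 0;
}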
*/ -static xfs_log_item_t * +xfs_log_item_t * xfs_ail_min( struct xfs_ail *ailp) { @@ -364,30 +363,31 @@ xfsaild_push( xfs_log_item_t *lip; xfs_lsn_t lsn; xfs_lsn_t target; - long tout = 10; + long tout; int stuck = 0; + int flushing = 0; int count = 0; - int push_xfsbufd = 0; /* - * If last time we ran we encountered pinned items, force the log first - * and wait for it before pushing again. + * If we encountered pinned items or did not finish writing out all + * buffers the last time we ran, force the log first and wait for it + * before pushing again. */ - spin_lock(&ailp->xa_lock); - if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && - !list_empty(&ailp->xa_ail)) { + if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->xa_buf_list) || + xfs_ail_min_lsn(ailp))) { ailp->xa_log_flush = 0; - spin_unlock(&ailp->xa_lock); + XFS_STATS_INC(xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); - spin_lock(&ailp->xa_lock); } - target = ailp->xa_target; + spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); - if (!lip || XFS_FORCED_SHUTDOWN(mp)) { + if (!lip) { /* - * AIL is empty or our push has reached the end. + * If the AIL is empty or our push has reached the end we are + * done now. */ xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); @@ -396,54 +396,42 @@ xfsaild_push( XFS_STATS_INC(xs_push_ail); - /* - * While the item we are looking at is below the given threshold - * try to flush it out. We'd like not to stop until we've at least - * tried to push on everything in the AIL with an LSN less than - * the given threshold. - * - * However, we will stop after a certain number of pushes and wait - * for a reduced timeout to fire before pushing further. This - * prevents us from spinning when we can't do anything or there is - * lots of contention on the AIL lists. - */ lsn = lip->li_lsn; + target = ailp->xa_target; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { int lock_result; + /* - * If we can lock the item without sleeping, unlock the AIL - * lock and flush the item. Then re-grab the AIL lock so we - * can look for the next item on the AIL. List changes are - * handled by the AIL lookup functions internally - * - * If we can't lock the item, either its holder will flush it - * or it is already being flushed or it is being relogged. In - * any of these cases it is being taken care of and we can just - * skip to the next item in the list. + * Note that IOP_PUSH may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. */ - lock_result = IOP_TRYLOCK(lip); - spin_unlock(&ailp->xa_lock); + lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); trace_xfs_ail_push(lip); - IOP_PUSH(lip); ailp->xa_last_pushed_lsn = lsn; break; - case XFS_ITEM_PUSHBUF: - XFS_STATS_INC(xs_push_ail_pushbuf); - trace_xfs_ail_pushbuf(lip); - - if (!IOP_PUSHBUF(lip)) { - trace_xfs_ail_pushbuf_pinned(lip); - stuck++; - ailp->xa_log_flush++; - } else { - ailp->xa_last_pushed_lsn = lsn; - } - push_xfsbufd = 1; + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already being + * flushed. The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering.
+ * + * We do not want to stop flushing just because lots + * of items are already being flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is being flushed. + */ + XFS_STATS_INC(xs_push_ail_flushing); + trace_xfs_ail_flushing(lip); + + flushing++; + ailp->xa_last_pushed_lsn = lsn; break; case XFS_ITEM_PINNED: @@ -453,28 +441,22 @@ xfsaild_push( stuck++; ailp->xa_log_flush++; break; - case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); trace_xfs_ail_locked(lip); + stuck++; break; - default: ASSERT(0); break; } - spin_lock(&ailp->xa_lock); - /* should we bother continuing? */ - if (XFS_FORCED_SHUTDOWN(mp)) - break; - ASSERT(mp->m_log); - count++; /* * Are there too many items we can't do anything with? + * * If we are skipping too many items because we can't flush * them or they are already being flushed, we back off and * give them time to complete whatever operation is being * done. @@ -496,42 +478,36 @@ xfsaild_push( xfs_trans_ail_cursor_done(ailp, &cur); spin_unlock(&ailp->xa_lock); - if (push_xfsbufd) { - /* we've got delayed write buffers to flush */ - wake_up_process(mp->m_ddev_targp->bt_task); - } + if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) + ailp->xa_log_flush++; - /* assume we have more work to do in a short while */ + if (!count || XFS_LSN_CMP(lsn, target) >= 0) { out_done: - if (!count) { - /* We're past our target or empty, so idle */ - ailp->xa_last_pushed_lsn = 0; - ailp->xa_log_flush = 0; - - tout = 50; - } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* - * We reached the target so wait a bit longer for I/O to - * complete and remove pushed items from the AIL before we - * start the next scan from the start of the AIL. + * We reached the target or the AIL is empty, so wait a bit + * longer for I/O to complete and remove pushed items from the + * AIL before we start the next scan from the start of the AIL. */ tout = 50; ailp->xa_last_pushed_lsn = 0; - } else if ((stuck * 100) / count > 90) { + } else if (((stuck + flushing) * 100) / count > 90) { /* - * Either there is a lot of contention on the AIL or we - * are stuck due to operations in progress. "Stuck" in this - * case is defined as >90% of the items we tried to push - * were stuck. + * Either there is a lot of contention on the AIL or we are + * stuck due to operations in progress. "Stuck" in this case + * is defined as >90% of the items we tried to push were stuck. * * Backoff a bit more to allow some I/O to complete before - * restarting from the start of the AIL. This prevents us - * from spinning on the same items, and if they are pinned will - * allow the restart to issue a log force to unpin the stuck - * items. + * restarting from the start of the AIL. This prevents us from + * spinning on the same items, and if they are pinned will allow + * the restart to issue a log force to unpin the stuck items. */ tout = 20; ailp->xa_last_pushed_lsn = 0; + } else { + /* + * Assume we have more work to do in a short while. + */ + tout = 10; } return tout; @@ -544,6 +520,8 @@ xfsaild( struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + current->flags |= PF_MEMALLOC; + while (!kthread_should_stop()) { if (tout && tout <= 20) __set_current_state(TASK_KILLABLE); @@ -611,6 +589,30 @@ xfs_ail_push_all( } /* + * Push out all items in the AIL immediately and wait until the AIL is empty.
+ */ +void +xfs_ail_push_all_sync( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip; + DEFINE_WAIT(wait); + + spin_lock(&ailp->xa_lock); + while ((lip = xfs_ail_max(ailp)) != NULL) { + prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->xa_target = lip->li_lsn; + wake_up_process(ailp->xa_task); + spin_unlock(&ailp->xa_lock); + schedule(); + spin_lock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + + finish_wait(&ailp->xa_empty, &wait); +} + +/* * xfs_trans_ail_update - bulk AIL insertion operation. * * @xfs_trans_ail_update takes an array of log items that all need to be @@ -667,11 +669,15 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - spin_unlock(&ailp->xa_lock); - if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - xlog_assign_tail_lsn(ailp->xa_mount); + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + spin_unlock(&ailp->xa_lock); + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } } @@ -700,7 +706,8 @@ void xfs_trans_ail_delete_bulk( struct xfs_ail *ailp, struct xfs_log_item **log_items, - int nr_items) __releases(ailp->xa_lock) + int nr_items, + int shutdown_type) __releases(ailp->xa_lock) { xfs_log_item_t *mlip; int mlip_changed = 0; @@ -718,7 +725,7 @@ xfs_trans_ail_delete_bulk( xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_force_shutdown(mp, shutdown_type); } return; } @@ -729,28 +736,20 @@ xfs_trans_ail_delete_bulk( if (mlip == lip) mlip_changed = 1; } - spin_unlock(&ailp->xa_lock); - if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { - xlog_assign_tail_lsn(ailp->xa_mount); + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + spin_unlock(&ailp->xa_lock); + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } } -/* - * The active item list (AIL) is a doubly linked list of log - * items sorted by ascending lsn. The base of the list is - * a forw/back pointer pair embedded in the xfs mount structure. - * The base is initialized with both pointers pointing to the - * base. This case always needs to be distinguished, because - * the base has no lsn to look at. We almost always insert - * at the end of the list, so on inserts we search from the - * end of the list to find where the new item belongs. - */ - -/* - * Initialize the doubly linked list to point only to itself. 
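The new xfs_ail_push_all_sync() above is a textbook wait-until-empty loop: set the push target to the current AIL maximum, wake the xfsaild, and sleep on xa_empty; the matching wake_up_all() sits in xfs_trans_ail_delete_bulk() once the list drains. The same shape in portable user-space code, with a condition variable standing in for the kernel waitqueue:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ail_empty = PTHREAD_COND_INITIALIZER;
static int nitems = 3;			/* stand-in for the AIL contents */

/* Writer side: whoever removes the last item signals "empty". */
static void *aild(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&ail_lock);
	while (nitems > 0) {
		nitems--;		/* "write out" one item */
		if (nitems == 0)
			pthread_cond_broadcast(&ail_empty);
	}
	pthread_mutex_unlock(&ail_lock);
	return NULL;
}

/* Analogue of xfs_ail_push_all_sync(): block until the list is drained. */
static void push_all_sync(void)
{
	pthread_mutex_lock(&ail_lock);
	while (nitems > 0)
		pthread_cond_wait(&ail_empty, &ail_lock);
	pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, aild, NULL);
	push_all_sync();
	pthread_join(&t, NULL);
	puts("AIL drained");
	return 0;
}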
- */ int xfs_trans_ail_init( xfs_mount_t *mp) @@ -765,6 +764,8 @@ xfs_trans_ail_init( INIT_LIST_HEAD(&ailp->xa_ail); INIT_LIST_HEAD(&ailp->xa_cursors); spin_lock_init(&ailp->xa_lock); + INIT_LIST_HEAD(&ailp->xa_buf_list); + init_waitqueue_head(&ailp->xa_empty); ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", ailp->xa_mount->m_fsname); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 1302d1d..21c5a5e 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -33,7 +31,6 @@ #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_trace.h" /* @@ -56,7 +53,7 @@ xfs_trans_buf_item_match( if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == blkno && - XFS_BUF_COUNT(blip->bli_buf) == len) + BBTOB(blip->bli_buf->b_length) == len) return blip->bli_buf; } @@ -141,15 +138,11 @@ xfs_trans_get_buf(xfs_trans_t *tp, xfs_buf_t *bp; xfs_buf_log_item_t *bip; - if (flags == 0) - flags = XBF_LOCK | XBF_MAPPED; - /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) - return xfs_buf_get(target_dev, blkno, len, - flags | XBF_DONT_BLOCK); + return xfs_buf_get(target_dev, blkno, len, flags); /* * If we find the buffer in the cache with this transaction @@ -165,14 +158,6 @@ xfs_trans_get_buf(xfs_trans_t *tp, XFS_BUF_DONE(bp); } - /* - * If the buffer is stale then it was binval'ed - * since last read. This doesn't matter since the - * caller isn't allowed to use the data anyway. - */ - else if (XFS_BUF_ISSTALE(bp)) - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - ASSERT(bp->b_transp == tp); bip = bp->b_fspriv; ASSERT(bip != NULL); @@ -182,15 +167,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return (bp); } - /* - * We always specify the XBF_DONT_BLOCK flag within a transaction - * so that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_get(target_dev, blkno, len, flags); if (bp == NULL) { return NULL; } @@ -282,14 +259,13 @@ xfs_trans_read_buf( xfs_buf_log_item_t *bip; int error; - if (flags == 0) - flags = XBF_LOCK | XBF_MAPPED; + *bpp = NULL; /* * Default to a normal get_buf() call if the tp is NULL. */ if (tp == NULL) { - bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); @@ -297,6 +273,8 @@ xfs_trans_read_buf( if (bp->b_error) { error = bp->b_error; xfs_buf_ioerror_alert(bp, __func__); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); return error; } @@ -371,15 +349,7 @@ xfs_trans_read_buf( return 0; } - /* - * We always specify the XBF_DONT_BLOCK flag within a transaction - * so that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. 
- */ - bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); + bp = xfs_buf_read(target, blkno, len, flags); if (bp == NULL) { *bpp = NULL; return (flags & XBF_TRYLOCK) ? @@ -418,19 +388,6 @@ xfs_trans_read_buf( return 0; shutdown_abort: - /* - * the theory here is that buffer is good but we're - * bailing out because the filesystem is being forcibly - * shut down. So we should leave the b_flags alone since - * the buffer's not staled and just get out. - */ -#if defined(DEBUG) - if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) - xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); -#endif - ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != - (XBF_STALE|XBF_DELWRI)); - trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); *bpp = NULL; @@ -606,7 +563,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); + ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -626,8 +583,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - xfs_buf_delwri_queue(bp); - trace_xfs_trans_log_buf(bip); /* @@ -651,22 +606,33 @@ xfs_trans_log_buf(xfs_trans_t *tp, /* - * This called to invalidate a buffer that is being used within - * a transaction. Typically this is because the blocks in the - * buffer are being freed, so we need to prevent it from being - * written out when we're done. Allowing it to be written again - * might overwrite data in the free blocks if they are reallocated - * to a file. + * Invalidate a buffer that is being used within a transaction. + * + * Typically this is because the blocks in the buffer are being freed, so we + * need to prevent it from being written out when we're done. Allowing it + * to be written again might overwrite data in the free blocks if they are + * reallocated to a file. + * + * We prevent the buffer from being written out by marking it stale. We can't + * get rid of the buf log item at this point because the buffer may still be + * pinned by another transaction. If that is the case, then we'll wait until + * the buffer is committed to disk for the last time (we can tell by the ref + * count) and free it in xfs_buf_item_unpin(). Until that happens we will + * keep the buffer locked so that the buffer and buf log item are not reused. + * + * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log + * the buf item. This will be used at recovery time to determine that copies + * of the buffer in the log before this should not be replayed. * - * We prevent the buffer from being written out by clearing the - * B_DELWRI flag. We can't always - * get rid of the buf log item at this point, though, because - * the buffer may still be pinned by another transaction. If that - * is the case, then we'll wait until the buffer is committed to - * disk for the last time (we can tell by the ref count) and - * free it in xfs_buf_item_unpin(). Until it is cleaned up we - * will keep the buffer locked so that the buffer and buf log item - * are not reused. + * We mark the item descriptor and the transaction dirty so that we'll hold + * the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state about which + * parts of the buffer have been logged. We also clear the flag indicating + * that this is an inode buffer since the data in the buffer will no longer + * be valid. 
+ * + * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( @@ -686,7 +652,6 @@ xfs_trans_binval( * If the buffer is already invalidated, then * just return. */ - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); @@ -696,27 +661,8 @@ xfs_trans_binval( return; } - /* - * Clear the dirty bit in the buffer and set the STALE flag - * in the buf log item. The STALE flag will be used in - * xfs_buf_item_unpin() to determine if it should clean up - * when the last reference to the buf item is given up. - * We set the XFS_BLF_CANCEL flag in the buf log format structure - * and log the buf item. This will be used at recovery time - * to determine that copies of the buffer in the log before - * this should not be replayed. - * We mark the item descriptor and the transaction dirty so - * that we'll hold the buffer until after the commit. - * - * Since we're invalidating the buffer, we also clear the state - * about which parts of the buffer have been logged. We also - * clear the flag indicating that this is an inode buffer since - * the data in the buffer will no longer be valid. - * - * We set the stale bit in the buffer as well since we're getting - * rid of it. - */ xfs_buf_stale(bp); + bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 2790997..bcb6054 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,9 +17,7 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index f7590f5..8d71b16 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -19,7 +19,6 @@ #include "xfs_fs.h" #include "xfs_types.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 7a7442c..d2eee20 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 8ab2ced..fb62377 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -71,6 +71,8 @@ struct xfs_ail { spinlock_t xa_lock; xfs_lsn_t xa_last_pushed_lsn; int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; }; /* @@ -90,18 +92,22 @@ xfs_trans_ail_update( } void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items) + struct xfs_log_item **log_items, int nr_items, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_delete( struct xfs_ail *ailp, - xfs_log_item_t *lip) __releases(ailp->xa_lock) + xfs_log_item_t *lip, + int shutdown_type) __releases(ailp->xa_lock) { - xfs_trans_ail_delete_bulk(ailp, &lip, 1); + xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); } void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); +void 
xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 65584b5..398cf68 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -57,6 +57,7 @@ typedef __uint64_t __psunsigned_t; #endif /* __KERNEL__ */ typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ typedef __uint32_t xfs_agnumber_t; /* allocation group number */ typedef __int32_t xfs_extnum_t; /* # of extents in a file */ @@ -101,6 +102,7 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + /* * Null values for the types. */ @@ -120,6 +122,9 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ #define NULLCOMMITLSN ((xfs_lsn_t)-1) +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + /* * Max values for extlen, extnum, aextnum. */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 79c05ac..4e5b9ad 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -18,9 +18,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" -#include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 64981d7..b6a82d8 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -21,7 +21,6 @@ #include "xfs_types.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -39,7 +38,6 @@ #include "xfs_bmap.h" #include "xfs_acl.h" #include "xfs_attr.h" -#include "xfs_rw.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_utils.h" @@ -81,8 +79,7 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), - XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; @@ -1919,7 +1916,7 @@ xfs_alloc_file_space( error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_bmap_cancel(&free_list); - xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -1966,7 +1963,7 @@ xfs_zero_remaining_bytes( bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, - mp->m_sb.sb_blocksize, XBF_DONT_BLOCK); + BTOBB(mp->m_sb.sb_blocksize), 0); if (!bp) return XFS_ERROR(ENOMEM); @@ -2315,17 +2312,33 @@ xfs_change_file_space( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP: case XFS_IOC_FREESP64: + /* + * These operations actually do IO when extending the file, but + * the allocation is done separately to the zeroing that is + * done. This set of operations needs to be serialised against + * other IO operations, such as truncate and buffered IO.
We + * need to take the IOLOCK here to serialise the allocation and + * zeroing IO to prevent other IOLOCK holders (e.g. getbmap, + * truncate, direct IO) from racing against the transient + * allocated but not written state we can have here. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); if (startoffset > fsize) { error = xfs_alloc_file_space(ip, fsize, - startoffset - fsize, 0, attr_flags); - if (error) + startoffset - fsize, 0, + attr_flags | XFS_ATTR_NOLOCK); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); break; + } } iattr.ia_valid = ATTR_SIZE; iattr.ia_size = startoffset; - error = xfs_setattr_size(ip, &iattr, attr_flags); + error = xfs_setattr_size(ip, &iattr, + attr_flags | XFS_ATTR_NOLOCK); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); if (error) return error; |