diff options
54 files changed, 1321 insertions, 1089 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f252928..f0c9505 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2285,6 +2285,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. The default parameter value of '0' causes the kernel not to attempt recovery of lost locks. + nfs4.layoutstats_timer = + [NFSv4.2] Change the rate at which the kernel sends + layoutstats to the pNFS metadata server. + + Setting this to value to 0 causes the kernel to use + whatever value is the default set by the layout + driver. A non-zero value sets the minimum interval + in seconds between layoutstats transmissions. + nfsd.nfs4_disable_idmapping= [NFSv4] When set to the default of '1', the NFSv4 server will return only numeric uids and gids to diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index bac3fb4..30eb245 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1144,73 +1144,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) } EXPORT_SYMBOL(ib_get_dma_mr); -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_mr *mr; - int err; - - err = ib_check_mr_access(mr_access_flags); - if (err) - return ERR_PTR(err); - - if (!pd->device->reg_phys_mr) - return ERR_PTR(-ENOSYS); - - mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!IS_ERR(mr)) { - mr->device = pd->device; - mr->pd = pd; - mr->uobject = NULL; - atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); - } - - return mr; -} -EXPORT_SYMBOL(ib_reg_phys_mr); - -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_pd *old_pd; - int ret; - - ret = ib_check_mr_access(mr_access_flags); - if (ret) - return ret; - - if (!mr->device->rereg_phys_mr) - return -ENOSYS; - - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - old_pd = mr->pd; - - ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, - phys_buf_array, num_phys_buf, - mr_access_flags, iova_start); - - if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { - atomic_dec(&old_pd->usecnt); - atomic_inc(&pd->usecnt); - } - - return ret; -} -EXPORT_SYMBOL(ib_rereg_phys_mr); - int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) { return mr->device->query_mr ? diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 92dca9e..c556640 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -46,13 +46,6 @@ struct pnfs_block_dev; -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; - #define PNFS_BLOCK_MAX_UUIDS 4 #define PNFS_BLOCK_MAX_DEVICES 64 @@ -117,13 +110,6 @@ struct pnfs_block_dev { struct pnfs_block_dev_map *map); }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ -}; - /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { union { @@ -134,15 +120,12 @@ struct pnfs_block_extent { sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ - enum exstate4 be_state; /* the state of this extent */ + enum pnfs_block_extent_state be_state; /* the state of this extent */ #define EXTENT_WRITTEN 1 #define EXTENT_COMMITTING 2 unsigned int be_tag; }; -/* on the wire size of the extent */ -#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) - struct pnfs_block_layout { struct pnfs_layout_hdr bl_layout; struct rb_root bl_ext_rw; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e535599..a861bbd 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev) kfree(dev->children); } else { if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ); + blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); } } @@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) return -EIO; p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); b->simple.sigs[i].sig_len = be32_to_cpup(p++); + if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) { + pr_info("signature too long: %d\n", + b->simple.sigs[i].sig_len); + return -EIO; + } p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); if (!p) @@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(d->bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 31d0b5e..c59a59c 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -462,6 +462,12 @@ out: return err; } +static size_t ext_tree_layoutupdate_size(size_t count) +{ + return sizeof(__be32) /* number of entries */ + + PNFS_BLOCK_EXTENT_SIZE * count; +} + static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, size_t buffer_size) { @@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, continue; (*count)++; - if (*count * BL_EXTENT_SIZE > buffer_size) { + if (ext_tree_layoutupdate_size(*count) > buffer_size) { /* keep counting.. */ ret = -ENOSPC; continue; @@ -530,7 +536,7 @@ retry: if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + buffer_size = ext_tree_layoutupdate_size(count); count = 0; arg->layoutupdate_pages = @@ -549,17 +555,14 @@ retry: } *start_p = cpu_to_be32(count); - arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + arg->layoutupdate_len = ext_tree_layoutupdate_size(count); if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { - __be32 *p = start_p; + void *p = start_p, *end = p + arg->layoutupdate_len; int i = 0; - for (p = start_p; - p < start_p + arg->layoutupdate_len; - p += PAGE_SIZE) { + for ( ; p < end; p += PAGE_SIZE) arg->layoutupdate_pages[i++] = vmalloc_to_page(p); - } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 2c4a0b56..75f7c0a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv) spin_lock_init(&serv->sv_cb_lock); init_waitqueue_head(&serv->sv_cb_waitq); rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(rqstp)) { - svc_xprt_put(serv->sv_bc_xprt); - serv->sv_bc_xprt = NULL; - } dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); return rqstp; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 29e3c1b..b85cf7a 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL, + -ntohl(res->status)); goto out; + } nfsi = NFS_I(inode); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); @@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, res->status = 0; out_iput: rcu_read_unlock(); + trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); @@ -194,6 +198,7 @@ unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); + trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); iput(ino); out: return rv; @@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy, status = htonl(NFS4_OK); nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid); - nfs41_server_notify_target_slotid_update(cps->clp); + nfs41_notify_server(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4a90c9b..57c5a02 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -20,6 +20,7 @@ #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> @@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_put_client); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) - return 0; - else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) - return sin1->sin6_scope_id == sin2->sin6_scope_id; - - return 1; -} -#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - return 0; -} -#endif - -/* - * Test if two ip4 socket addresses refer to the same socket, by - * comparing relevant fields. The padding bytes specifically, are - * not compared. - * - * The caller should ensure both socket addresses are AF_INET. - */ -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; -} - -static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1; - const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2; - - return nfs_sockaddr_match_ipaddr6(sa1, sa2) && - (sin1->sin6_port == sin2->sin6_port); -} - -static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1; - const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2; - - return nfs_sockaddr_match_ipaddr4(sa1, sa2) && - (sin1->sin_port == sin2->sin_port); -} - -#if defined(CONFIG_NFS_V4_1) -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, excluding the port number. - */ -int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6(sa1, sa2); - } - return 0; -} -EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr); -#endif /* CONFIG_NFS_V4_1 */ - -/* - * Test if two socket addresses represent the same actual socket, - * by comparing (only) relevant fields, including the port number. - */ -static int nfs_sockaddr_cmp(const struct sockaddr *sa1, - const struct sockaddr *sa2) -{ - if (sa1->sa_family != sa2->sa_family) - return 0; - - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_cmp_ip4(sa1, sa2); - case AF_INET6: - return nfs_sockaddr_cmp_ip6(sa1, sa2); - } - return 0; -} - /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. @@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat if (clp->cl_minorversion != data->minorversion) continue; /* Match the full socket address */ - if (!nfs_sockaddr_cmp(sap, clap)) + if (!rpc_cmp_addr_port(sap, clap)) continue; atomic_inc(&clp->cl_count); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 029d688..2714ef8 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation->inode != NULL) { nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, &res->delegation); delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; + delegation->pagemod_limit = res->pagemod_limit; delegation->change_attr = inode->i_version; delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, rcu_read_unlock(); return ret; } + +/** + * nfs4_delegation_flush_on_close - Check if we must flush file on close + * @inode: inode to check + * + * This function checks the number of outstanding writes to the file + * against the delegation 'space_limit' field to see if + * the spec requires us to flush the file on close. + */ +bool nfs4_delegation_flush_on_close(const struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_delegation *delegation; + bool ret = true; + + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || !(delegation->type & FMODE_WRITE)) + goto out; + if (nfsi->nrequests < delegation->pagemod_limit) + ret = false; +out: + rcu_read_unlock(); + return ret; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index e3c20a3..a448291 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -18,7 +18,7 @@ struct nfs_delegation { struct inode *inode; nfs4_stateid stateid; fmode_t type; - loff_t maxsize; + unsigned long pagemod_limit; __u64 change_attr; unsigned long flags; spinlock_t lock; @@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); +bool nfs4_delegation_flush_on_close(const struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 547308a..3d8e4ff 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -583,26 +583,19 @@ out_nopages: } static -void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages) +void nfs_readdir_free_pages(struct page **pages, unsigned int npages) { unsigned int i; for (i = 0; i < npages; i++) put_page(pages[i]); } -static -void nfs_readdir_free_large_page(void *ptr, struct page **pages, - unsigned int npages) -{ - nfs_readdir_free_pagearray(pages, npages); -} - /* * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_large_page + * to nfs_readdir_free_pagearray */ static -int nfs_readdir_large_page(struct page **pages, unsigned int npages) +int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) { unsigned int i; @@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages) return 0; out_freepages: - nfs_readdir_free_pagearray(pages, i); + nfs_readdir_free_pages(pages, i); return -ENOMEM; } @@ -623,7 +616,6 @@ static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { struct page *pages[NFS_MAX_READDIR_PAGES]; - void *pages_ptr = NULL; struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; @@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; - status = nfs_readdir_large_page(pages, array_size); + status = nfs_readdir_alloc_pages(pages, array_size); if (status < 0) goto out_release_array; do { @@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, } } while (array->eof_index < 0); - nfs_readdir_free_large_page(pages_ptr, pages, array_size); + nfs_readdir_free_pages(pages, array_size); out_release_array: nfs_readdir_release_array(page); out_label_free: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc4fa1e..c0f9b1e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp) dprintk("NFS: release(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); - return nfs_release(inode, filp); + nfs_file_clear_open_context(filp); + return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); @@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek); /* * Flush all dirty pages, and check for write errors. */ -int +static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) if ((file->f_mode & FMODE_WRITE) == 0) return 0; - /* - * If we're holding a write delegation, then just start the i/o - * but don't wait for completion (or send a commit). - */ - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - return filemap_fdatawrite(file->f_mapping); - /* Flush writes to the server and return any errors */ return vfs_fsync(file, 0); } -EXPORT_SYMBOL_GPL(nfs_file_flush); ssize_t nfs_file_read(struct kiocb *iocb, struct iov_iter *to) @@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_sync_write(struct file *filp, struct inode *inode) +static int nfs_need_check_write(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx; - if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC)) - return 1; ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || nfs_ctx_key_to_expire(ctx)) @@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result > 0) written = result; - /* Return error values for O_DSYNC and IS_SYNC() */ - if (result >= 0 && nfs_need_sync_write(file, inode)) { + /* Return error values */ + if (result >= 0 && nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b3289d7..fbc5a56 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { INIT_LIST_HEAD(&ffl->error_list); + INIT_LIST_HEAD(&ffl->mirrors); return &ffl->generic_hdr; } else return NULL; @@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + int i, j; + + if (m1->fh_versions_cnt != m2->fh_versions_cnt) + return false; + for (i = 0; i < m1->fh_versions_cnt; i++) { + bool found_fh = false; + for (j = 0; j < m2->fh_versions_cnt; i++) { + if (nfs_compare_fh(&m1->fh_versions[i], + &m2->fh_versions[j]) == 0) { + found_fh = true; + break; + } + } + if (!found_fh) + return false; + } + return true; +} + +static struct nfs4_ff_layout_mirror * +ff_layout_add_mirror(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); + struct nfs4_ff_layout_mirror *pos; + struct inode *inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { + if (mirror->mirror_ds != pos->mirror_ds) + continue; + if (!ff_mirror_match_fh(mirror, pos)) + continue; + if (atomic_inc_not_zero(&pos->ref)) { + spin_unlock(&inode->i_lock); + return pos; + } + } + list_add(&mirror->mirrors, &ff_layout->mirrors); + mirror->layout = lo; + spin_unlock(&inode->i_lock); + return mirror; +} + +static void +ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + struct inode *inode; + if (mirror->layout == NULL) + return; + inode = mirror->layout->plh_inode; + spin_lock(&inode->i_lock); + list_del(&mirror->mirrors); + spin_unlock(&inode->i_lock); + mirror->layout = NULL; +} + +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +{ + struct nfs4_ff_layout_mirror *mirror; + + mirror = kzalloc(sizeof(*mirror), gfp_flags); + if (mirror != NULL) { + spin_lock_init(&mirror->lock); + atomic_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + } + return mirror; +} + +static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + ff_layout_remove_mirror(mirror); + kfree(mirror->fh_versions); + if (mirror->cred) + put_rpccred(mirror->cred); + nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + kfree(mirror); +} + +static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + ff_layout_free_mirror(mirror); +} + static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { int i; @@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) /* normally mirror_ds is freed in * .free_deviceid_node but we still do it here * for .alloc_lseg error path */ - if (fls->mirror_array[i]) { - kfree(fls->mirror_array[i]->fh_versions); - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - kfree(fls->mirror_array[i]); - } + ff_layout_put_mirror(fls->mirror_array[i]); } kfree(fls->mirror_array); fls->mirror_array = NULL; @@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } } +static bool +ff_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1, end2; + + if (l1->iomode != l2->iomode) + return l1->iomode != IOMODE_READ; + end1 = pnfs_calc_offset_end(l1->offset, l1->length); + end2 = pnfs_calc_offset_end(l2->offset, l2->length); + if (end1 < l2->offset) + return false; + if (end2 < l1->offset) + return true; + return l2->offset <= l1->offset; +} + +static bool +ff_lseg_merge(struct pnfs_layout_segment *new, + struct pnfs_layout_segment *old) +{ + u64 new_end, old_end; + + if (new->pls_range.iomode != old->pls_range.iomode) + return false; + old_end = pnfs_calc_offset_end(old->pls_range.offset, + old->pls_range.length); + if (old_end < new->pls_range.offset) + return false; + new_end = pnfs_calc_offset_end(new->pls_range.offset, + new->pls_range.length); + if (new_end < old->pls_range.offset) + return false; + + /* Mergeable: copy info from 'old' to 'new' */ + if (new_end < old_end) + new_end = old_end; + if (new->pls_range.offset < old->pls_range.offset) + new->pls_range.offset = old->pls_range.offset; + new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset, + new_end); + if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) + set_bit(NFS_LSEG_ROC, &new->pls_flags); + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); + return true; +} + +static void +ff_layout_add_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + pnfs_generic_layout_insert_lseg(lo, lseg, + ff_lseg_range_is_after, + ff_lseg_merge, + free_me); +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; @@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; u32 ds_count; @@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (ds_count != 1) goto out_err_free; - fls->mirror_array[i] = - kzalloc(sizeof(struct nfs4_ff_layout_mirror), - gfp_flags); + fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; - fls->mirror_array[i]->lseg = &fls->generic_hdr; /* deviceid */ rc = decode_deviceid(&stream, &devid); @@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, if (rc) goto out_err_free; + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); + if (mirror != fls->mirror_array[i]) { + ff_layout_free_mirror(fls->mirror_array[i]); + fls->mirror_array[i] = mirror; + } + dprintk("%s: uid %d gid %d\n", __func__, fls->mirror_array[i]->uid, fls->mirror_array[i]->gid); @@ -379,21 +527,9 @@ static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - int i; dprintk("--> %s\n", __func__); - for (i = 0; i < fls->mirror_array_cnt; i++) { - if (fls->mirror_array[i]) { - nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); - fls->mirror_array[i]->mirror_ds = NULL; - if (fls->mirror_array[i]->cred) { - put_rpccred(fls->mirror_array[i]->cred); - fls->mirror_array[i]->cred = NULL; - } - } - } - if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_flexfile_layout *ffl; struct inode *inode; @@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) } static void -nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer) +nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { /* first IO request? */ if (atomic_inc_return(&timer->n_ops) == 1) { - timer->start_time = ktime_get(); + timer->start_time = now; } } static ktime_t -nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer) +nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { - ktime_t start, now; + ktime_t start; if (atomic_dec_return(&timer->n_ops) < 0) WARN_ON_ONCE(1); - now = ktime_get(); start = timer->start_time; timer->start_time = now; return ktime_sub(now, start); } -static ktime_t -nfs4_ff_layout_calc_completion_time(struct rpc_task *task) -{ - return ktime_sub(ktime_get(), task->tk_start); -} - static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, - struct nfs4_ff_layoutstat *layoutstat) + struct nfs4_ff_layoutstat *layoutstat, + ktime_t now) { static const ktime_t notime = {0}; - ktime_t now = ktime_get(); + s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; - nfs4_ff_start_busy_timer(&layoutstat->busy_timer); + nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); if (ktime_equal(mirror->start_time, notime)) mirror->start_time = now; if (ktime_equal(mirror->last_report_time, notime)) mirror->last_report_time = now; + if (layoutstats_timer != 0) + report_interval = (s64)layoutstats_timer * 1000LL; if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= - FF_LAYOUTSTATS_REPORT_INTERVAL) { + report_interval) { mirror->last_report_time = now; return true; } @@ -482,35 +614,39 @@ static void nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, __u64 requested, __u64 completed, - ktime_t time_completed) + ktime_t time_completed, + ktime_t time_started) { struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + ktime_t completion_time = ktime_sub(time_completed, time_started); ktime_t timer; iostat->ops_completed++; iostat->bytes_completed += completed; iostat->bytes_not_delivered += requested - completed; - timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer); + timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed); iostat->total_busy_time = ktime_add(iostat->total_busy_time, timer); iostat->aggregate_completion_time = - ktime_add(iostat->aggregate_completion_time, time_completed); + ktime_add(iostat->aggregate_completion_time, + completion_time); } static void -nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror, - __u64 requested) +nfs4_ff_layout_stat_io_start_read(struct inode *inode, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat); + report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); + pnfs_report_layoutstat(inode, GFP_KERNEL); } static void @@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, requested, completed, - nfs4_ff_layout_calc_completion_time(task)); + ktime_get(), task->tk_start); spin_unlock(&mirror->lock); } static void -nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror, - __u64 requested) +nfs4_ff_layout_stat_io_start_write(struct inode *inode, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat); + report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); + pnfs_report_layoutstat(inode, GFP_NOIO); } static void @@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, - requested, completed, - nfs4_ff_layout_calc_completion_time(task)); + requested, completed, ktime_get(), task->tk_start); spin_unlock(&mirror->lock); } @@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); return 1; } @@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, if (task->tk_status >= 0) return 0; - if (task->tk_status != -EJUKEBOX) { + switch (task->tk_status) { + /* File access problems. Don't mark the device as unavailable */ + case -EACCES: + case -ESTALE: + case -EISDIR: + case -EBADHANDLE: + case -ELOOP: + case -ENOSPC: + break; + case -EJUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); - if (ff_layout_has_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; - else - return -NFS4ERR_RESET_TO_MDS; } - - if (task->tk_status == -EJUKEBOX) - nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + /* FIXME: Need to prevent infinite looping here. */ + return -NFS4ERR_RESET_TO_PNFS; +out_retry: task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); @@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task, static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, int idx, u64 offset, u64 length, - u32 status, int opnum) + u32 status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; int err; + if (status == 0) { + switch (error) { + case -ETIMEDOUT: + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + case -EOPNOTSUPP: + case -ECONNREFUSED: + case -ECONNRESET: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EADDRINUSE: + case -ENOBUFS: + case -EPIPE: + case -EPERM: + status = NFS4ERR_NXIO; + break; + case -EACCES: + status = NFS4ERR_ACCESS; + break; + default: + return; + } + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, GFP_NOIO); + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode; int err; trace_nfs4_pnfs_read(hdr, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) - hdr->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && hdr->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_READ); + hdr->res.op_status, OP_READ, + task->tk_status); err = ff_layout_async_handle_error(task, hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); @@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - inode = hdr->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, hdr->lseg); ff_layout_reset_read(hdr); return task->tk_status; case -EAGAIN: @@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) static int ff_layout_read_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_ff_layout_stat_io_start_read( + nfs4_ff_layout_stat_io_start_read(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count); + hdr->args.count, + task->tk_start); if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); @@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) - hdr->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && hdr->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_WRITE); + hdr->res.op_status, OP_WRITE, + task->tk_status); err = ff_layout_async_handle_error(task, hdr->args.context->state, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); switch (err) { case -NFS4ERR_RESET_TO_PNFS: + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, true); + return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - inode = hdr->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, hdr->lseg); - if (err == -NFS4ERR_RESET_TO_PNFS) { - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); - ff_layout_reset_write(hdr, true); - } else { - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); - ff_layout_reset_write(hdr, false); - } + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: rpc_restart_call_prepare(task); @@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->res.verf->committed == NFS_DATA_SYNC) ff_layout_set_layoutcommit(hdr); + /* zero out fattr since we don't care DS attr at all */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { - struct inode *inode; int err; trace_nfs4_pnfs_commit_ds(data, task->tk_status); - if (task->tk_status == -ETIMEDOUT && !data->res.op_status) - data->res.op_status = NFS4ERR_NXIO; - if (task->tk_status < 0 && data->res.op_status) + if (task->tk_status < 0) ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, data->args.offset, data->args.count, - data->res.op_status, OP_COMMIT); + data->res.op_status, OP_COMMIT, + task->tk_status); err = ff_layout_async_handle_error(task, NULL, data->ds_clp, data->lseg, data->ds_commit_index); switch (err) { case -NFS4ERR_RESET_TO_PNFS: + pnfs_set_retry_layoutget(data->lseg->pls_layout); + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - inode = data->lseg->pls_layout->plh_inode; - pnfs_error_mark_layout_for_return(inode, data->lseg); - if (err == -NFS4ERR_RESET_TO_PNFS) - pnfs_set_retry_layoutget(data->lseg->pls_layout); - else - pnfs_clear_retry_layoutget(data->lseg->pls_layout); + pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, static int ff_layout_write_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_ff_layout_stat_io_start_write( + nfs4_ff_layout_stat_io_start_write(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count); + hdr->args.count, + task->tk_start); if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); @@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) static void ff_layout_commit_prepare_common(struct rpc_task *task, struct nfs_commit_data *cdata) { - nfs4_ff_layout_stat_io_start_write( + nfs4_ff_layout_stat_io_start_write(cdata->inode, FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), - 0); + 0, task->tk_start); } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) @@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, *start = cpu_to_be32((xdr->p - start - 1) * 4); } -static bool +static int ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, - struct pnfs_layout_segment *pls, - int *dev_count, int dev_limit) + struct pnfs_layout_hdr *lo, + int dev_limit) { + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *dev; struct nfs42_layoutstat_devinfo *devinfo; - int i; + int i = 0; - for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) { - if (*dev_count >= dev_limit) + list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { + if (i >= dev_limit) break; - mirror = FF_LAYOUT_COMP(pls, i); - if (!mirror || !mirror->mirror_ds) + if (!mirror->mirror_ds) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!atomic_inc_not_zero(&mirror->ref)) continue; - dev = FF_LAYOUT_DEVID_NODE(pls, i); - devinfo = &args->devinfo[*dev_count]; + dev = &mirror->mirror_ds->id_node; + devinfo = &args->devinfo[i]; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - devinfo->offset = pls->pls_range.offset; - devinfo->length = pls->pls_range.length; - /* well, we don't really know if IO is continuous or not! */ - devinfo->read_count = mirror->read_stat.io_stat.bytes_completed; + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + devinfo->read_count = mirror->read_stat.io_stat.ops_completed; devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.bytes_completed; + devinfo->write_count = mirror->write_stat.io_stat.ops_completed; devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; devinfo->layout_type = LAYOUT_FLEX_FILES; devinfo->layoutstats_encode = ff_layout_encode_layoutstats; devinfo->layout_private = mirror; - /* lseg refcount put in cleanup_layoutstats */ - pnfs_get_lseg(pls); - ++(*dev_count); + i++; } - - return *dev_count < dev_limit; + return i; } static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { - struct pnfs_layout_segment *pls; + struct nfs4_flexfile_layout *ff_layout; + struct nfs4_ff_layout_mirror *mirror; int dev_count = 0; spin_lock(&args->inode->i_lock); - list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { - dev_count += FF_LAYOUT_MIRROR_COUNT(pls); + ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); + list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { + if (atomic_read(&mirror->ref) != 0) + dev_count ++; } spin_unlock(&args->inode->i_lock); /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ @@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); dev_count = PNFS_LAYOUTSTATS_MAXDEV; } - args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL); + args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); if (!args->devinfo) return -ENOMEM; - dev_count = 0; spin_lock(&args->inode->i_lock); - list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { - if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count, - PNFS_LAYOUTSTATS_MAXDEV)) { - break; - } - } + args->num_dev = ff_layout_mirror_prepare_stats(args, + &ff_layout->generic_hdr, dev_count); spin_unlock(&args->inode->i_lock); - args->num_dev = dev_count; return 0; } @@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) for (i = 0; i < data->args.num_dev; i++) { mirror = data->args.devinfo[i].layout_private; data->args.devinfo[i].layout_private = NULL; - pnfs_put_lseg(mirror->lseg); + ff_layout_put_mirror(mirror); } } @@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .free_layout_hdr = ff_layout_free_layout_hdr, .alloc_lseg = ff_layout_alloc_lseg, .free_lseg = ff_layout_free_lseg, + .add_lseg = ff_layout_add_lseg, .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index f92f9a0..68cc0d9 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat { }; struct nfs4_ff_layout_mirror { - struct pnfs_layout_segment *lseg; /* back pointer */ + struct pnfs_layout_hdr *layout; + struct list_head mirrors; u32 ds_count; u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; @@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror { u32 uid; u32 gid; struct rpc_cred *cred; + atomic_t ref; spinlock_t lock; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; @@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment { struct nfs4_flexfile_layout { struct pnfs_layout_hdr generic_hdr; struct pnfs_ds_commit_info commit_info; + struct list_head mirrors; struct list_head error_list; /* nfs4_ff_layout_ds_err */ }; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f13e196..e125e55 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -172,6 +172,32 @@ out_err: return NULL; } +static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, + struct nfs4_deviceid_node *devid) +{ + nfs4_mark_deviceid_unavailable(devid); + if (!ff_layout_has_available_ds(lseg)) + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); +} + +static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror == NULL || mirror->mirror_ds == NULL) { + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); + return false; + } + if (mirror->mirror_ds->ds == NULL) { + struct nfs4_deviceid_node *devid; + devid = &mirror->mirror_ds->id_node; + ff_layout_mark_devid_invalid(lseg, devid); + return false; + } + return true; +} + static u64 end_offset(u64 start, u64 len) { @@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); struct nfs_fh *fh = NULL; - struct nfs4_deviceid_node *devid; - if (mirror == NULL || mirror->mirror_ds == NULL || - mirror->mirror_ds->ds == NULL) { - printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", + if (!ff_layout_mirror_valid(lseg, mirror)) { + pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", __func__, mirror_idx); - if (mirror && mirror->mirror_ds) { - devid = &mirror->mirror_ds->id_node; - pnfs_generic_mark_devid_invalid(devid); - } goto out; } @@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, unsigned int max_payload; rpc_authflavor_t flavor; - if (mirror == NULL || mirror->mirror_ds == NULL || - mirror->mirror_ds->ds == NULL) { - printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + if (!ff_layout_mirror_valid(lseg, mirror)) { + pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - if (mirror && mirror->mirror_ds) { - devid = &mirror->mirror_ds->id_node; - pnfs_generic_mark_devid_invalid(devid); - } goto out; } @@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, range->offset, range->length)) continue; /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) - * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) + * + array length + deviceid(NFS4_DEVICEID4_SIZE) + * + status(4) + opnum(4) */ p = xdr_reserve_space(xdr, - 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); + 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); if (unlikely(!p)) return -ENOBUFS; p = xdr_encode_hyper(p, err->offset); p = xdr_encode_hyper(p, err->length); p = xdr_encode_opaque_fixed(p, &err->stateid, NFS4_STATEID_SIZE); + /* Encode 1 error */ + *p++ = cpu_to_be32(1); p = xdr_encode_opaque_fixed(p, &err->deviceid, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(err->status); @@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, return 0; } -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - int idx; + u32 idx; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); @@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return false; } +static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *devid; + u32 idx; + + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + if (!mirror || !mirror->mirror_ds) + return false; + devid = &mirror->mirror_ds->id_node; + if (ff_layout_test_devid_unavailable(devid)) + return false; + } + + return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; +} + +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_READ) + return ff_read_layout_has_available_ds(lseg); + /* Note: RW layout needs all mirrors available */ + return ff_rw_layout_has_available_ds(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request before it attempts further " diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0adc7d2..326d9e1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; - int error = -ENOMEM; + int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { - loff_t i_size; - BUG_ON(!S_ISREG(inode->i_mode)); - i_size = i_size_read(inode); - if (attr->ia_size == i_size) + error = inode_newsize_ok(inode, attr->ia_size); + if (error) + return error; + + if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; - else if (attr->ia_size < i_size && IS_SWAPFILE(inode)) - return -ETXTBSY; } /* Optimization: if the end result is no change, don't RPC */ @@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); - if (fattr == NULL) + if (fattr == NULL) { + error = -ENOMEM; goto out; + } + /* * Return any delegations if we're going to change ACLs */ @@ -759,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context); * @ctx: pointer to context * @is_sync: is this a synchronous close * - * always ensure that the attributes are up to date if we're mounted - * with close-to-open semantics + * Ensure that the attributes are up to date if we're mounted + * with close-to-open semantics and we have cached data that will + * need to be revalidated on open. */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { + struct nfs_inode *nfsi; struct inode *inode; struct nfs_server *server; @@ -772,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) if (!is_sync) return; inode = d_inode(ctx->dentry); - if (!list_empty(&NFS_I(inode)->open_files)) + nfsi = NFS_I(inode); + if (inode->i_mapping->nrpages == 0) + return; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return; + if (!list_empty(&nfsi->open_files)) return; server = NFS_SERVER(inode); if (server->flags & NFS_MOUNT_NOCTO) @@ -844,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx) } EXPORT_SYMBOL_GPL(put_nfs_open_context); +static void put_nfs_open_context_sync(struct nfs_open_context *ctx) +{ + __put_nfs_open_context(ctx, 1); +} + /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages @@ -888,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c return ctx; } -static void nfs_file_clear_open_context(struct file *filp) +void nfs_file_clear_open_context(struct file *filp) { struct nfs_open_context *ctx = nfs_file_open_context(filp); @@ -899,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp) spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); spin_unlock(&inode->i_lock); - __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1); + put_nfs_open_context_sync(ctx); } } @@ -919,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp) return 0; } -int nfs_release(struct inode *inode, struct file *filp) -{ - nfs_file_clear_open_context(filp); - return 0; -} - /* * This function is called whenever some part of NFS notices that * the cached attributes have to be refreshed. @@ -1273,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; } -static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_CTIME)) - return 0; - return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1428,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n const struct nfs_inode *nfsi = NFS_I(inode); return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || - nfs_ctime_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } @@ -1491,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr { unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + /* + * Don't revalidate the pagecache if we hold a delegation, but do + * force an attribute update + */ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED; + if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9b372b8..56cfde2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void) } #endif -#ifdef CONFIG_NFS_V4_1 -int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; @@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) /* file.c */ int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); loff_t nfs_file_llseek(struct file *, loff_t, int); -int nfs_file_flush(struct file *, fl_owner_t); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); @@ -490,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list, void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); +void nfs_request_add_commit_list_locked(struct nfs_page *req, + struct list_head *dst, + struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, @@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) * Record the page as unstable and mark its inode as dirty. */ static inline -void nfs_mark_page_unstable(struct page *page) +void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) { - struct inode *inode = page_file_mapping(page)->host; + if (!cinfo->dreq) { + struct inode *inode = page_file_mapping(page)->host; - inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + } } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9b04c2e..267126d 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, { encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); encode_symlinkdata3(xdr, args); + xdr->buf->flags |= XDRBUF_WRITE; } /* diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index ff66ae7..814c125 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); -/* nfs4.2xdr.h */ -extern struct rpc_procinfo nfs4_2_procedures[]; #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index a6bd27d..0eb29e1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -238,8 +238,7 @@ out_overflow: return -EIO; } -static int decode_layoutstats(struct xdr_stream *xdr, - struct nfs42_layoutstat_res *res) +static int decode_layoutstats(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LAYOUTSTATS); } @@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp, goto out; WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); for (i = 0; i < res->num_dev; i++) { - status = decode_layoutstats(xdr, res); + status = decode_layoutstats(xdr); if (status) goto out; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ea3bee9..50cfc4c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, struct rpc_cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); -extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp); -extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp); - +extern void nfs41_notify_server(struct nfs_client *); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3aa6a9b..223bedd 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr, return false; /* Match only the IP address, not the port number */ - if (!nfs_sockaddr_match_ipaddr(addr, clap)) - return false; - - return true; + return rpc_cmp_addr(addr, clap); } /* diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index dcd39d4..b0dbe0a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -6,7 +6,9 @@ #include <linux/fs.h> #include <linux/falloc.h> #include <linux/nfs_fs.h> +#include "delegation.h" #include "internal.h" +#include "iostat.h" #include "fscache.h" #include "pnfs.h" @@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) struct inode *dir; unsigned openflags = filp->f_flags; struct iattr attr; - int opened = 0; int err; /* @@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) nfs_sync_inode(inode); } - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { @@ -100,6 +101,31 @@ out_drop: goto out_put_ctx; } +/* + * Flush all dirty pages, and check for write errors. + */ +static int +nfs4_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + dprintk("NFS: flush(%pD2)\n", file); + + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + /* + * If we're holding a write delegation, then check if we're required + * to flush the i/o on close. If not, then just start the i/o now. + */ + if (!nfs4_delegation_flush_on_close(inode)) + return filemap_fdatawrite(file->f_mapping); + + /* Flush writes to the server and return any errors */ + return vfs_fsync(file, 0); +} + static int nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = { .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs4_file_open, - .flush = nfs_file_flush, + .flush = nfs4_file_flush, .release = nfs_file_release, .fsync = nfs4_file_fsync, .lock = nfs_lock, diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 535dfc6..2e49022 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = { .read = user_read, }; -static int nfs_idmap_init_keyring(void) +int nfs_idmap_init(void) { struct cred *cred; struct key *keyring; @@ -230,7 +230,7 @@ failed_put_cred: return ret; } -static void nfs_idmap_quit_keyring(void) +void nfs_idmap_quit(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); @@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp) kfree(idmap); } -int nfs_idmap_init(void) -{ - return nfs_idmap_init_keyring(); -} - -void nfs_idmap_quit(void) -{ - nfs_idmap_quit_keyring(); -} - static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, struct idmap_msg *im, struct rpc_pipe_msg *msg) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3acb1eb..693b903 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -586,7 +586,7 @@ out_unlock: spin_unlock(&tbl->slot_tbl_lock); res->sr_slot = NULL; if (send_new_highest_used_slotid) - nfs41_server_notify_highest_slotid_update(session->clp); + nfs41_notify_server(session->clp); } int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -1150,7 +1150,8 @@ out: return ret; } -static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, + enum open_claim_type4 claim) { if (delegation == NULL) return 0; @@ -1158,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) return 0; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) + break; + default: + return 0; + } nfs_mark_delegation_referenced(delegation); return 1; } @@ -1220,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state) } static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *arg_stateid, nfs4_stateid *stateid, fmode_t fmode) { clear_bit(NFS_O_RDWR_STATE, &state->flags); @@ -1238,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (stateid == NULL) return; /* Handle races with OPEN */ - if (!nfs4_stateid_match_other(stateid, &state->open_stateid) || - !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) || + (nfs4_stateid_match_other(stateid, &state->open_stateid) && + !nfs4_stateid_is_newer(stateid, &state->open_stateid))) { nfs_resync_open_stateid_locked(state); return; } @@ -1248,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, nfs4_stateid_copy(&state->open_stateid, stateid); } -static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_clear_open_stateid(struct nfs4_state *state, + nfs4_stateid *arg_stateid, + nfs4_stateid *stateid, fmode_t fmode) { write_seqlock(&state->seqlock); - nfs_clear_open_stateid_locked(state, stateid, fmode); + nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode); write_sequnlock(&state->seqlock); if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(state->owner->so_server->nfs_client); @@ -1376,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; + enum open_claim_type4 claim = opendata->o_arg.claim; nfs4_stateid stateid; int ret = -EAGAIN; @@ -1389,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) spin_unlock(&state->owner->so_lock); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (!can_open_delegated(delegation, fmode)) { + if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; } @@ -1852,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state_owner *sp = data->owner; struct nfs_client *clp = sp->so_server->nfs_client; + enum open_claim_type4 claim = data->o_arg.claim; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) goto out_wait; @@ -1866,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && - data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH && - can_open_delegated(delegation, data->o_arg.fmode)) + if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); } /* Update client id. */ data->o_arg.clientid = clp->cl_clientid; - switch (data->o_arg.claim) { + switch (claim) { + default: + break; case NFS4_OPEN_CLAIM_PREVIOUS: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: @@ -2294,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr) +static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, + struct iattr *sattr, struct nfs4_label **label) { - if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) && + const u32 *attrset = opendata->o_res.attrset; + + if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && !(sattr->ia_valid & ATTR_ATIME_SET)) sattr->ia_valid |= ATTR_ATIME; - if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) && + if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && !(sattr->ia_valid & ATTR_MTIME_SET)) sattr->ia_valid |= ATTR_MTIME; + + /* Except MODE, it seems harmless of setting twice. */ + if ((attrset[1] & FATTR4_WORD1_MODE)) + sattr->ia_valid &= ~ATTR_MODE; + + if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + *label = NULL; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, @@ -2425,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir, goto err_free_label; state = ctx->state; - if ((opendata->o_arg.open_flags & O_EXCL) && + if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr); + nfs4_exclusive_attrset(opendata, sattr, &label); nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, @@ -2439,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir, nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } - if (opendata->file_created) + if (opened && opendata->file_created) *opened |= FILE_CREATED; if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { @@ -2661,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: res_stateid = &calldata->res.stateid; - if (calldata->arg.fmode == 0 && calldata->roc) + if (calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); renew_lease(server, calldata->timestamp); @@ -2684,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); + nfs_clear_open_stateid(state, &calldata->arg.stateid, + res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); @@ -2735,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } - if (calldata->arg.fmode == 0) { + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; - if (calldata->roc && - pnfs_roc_drain(inode, &calldata->roc_barrier, task)) { - nfs_release_seqid(calldata->arg.seqid); - goto out_wait; - } - } + if (calldata->roc) + pnfs_roc_get_barrier(inode, &calldata->roc_barrier); + calldata->arg.share_access = nfs4_map_atomic_open_share(NFS_SERVER(inode), calldata->arg.fmode, 0); @@ -2883,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { + u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion; struct nfs4_server_caps_arg args = { .fhandle = fhandle, + .bitmask = bitmask, }; struct nfs4_server_caps_res res = {}; struct rpc_message msg = { @@ -2894,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; + bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | + FATTR4_WORD0_FH_EXPIRE_TYPE | + FATTR4_WORD0_LINK_SUPPORT | + FATTR4_WORD0_SYMLINK_SUPPORT | + FATTR4_WORD0_ACLSUPPORT; + if (minorversion) + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { /* Sanity check the server answers */ - switch (server->nfs_client->cl_minorversion) { + switch (minorversion) { case 0: res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK; res.attr_bitmask[2] = 0; @@ -2950,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, + sizeof(server->exclcreat_bitmask)); server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -3552,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; - int opened = 0; int status = 0; ctx = alloc_nfs_open_context(dentry, FMODE_READ); @@ -3562,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened); + state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); if (IS_ERR(state)) { status = PTR_ERR(state); goto out; @@ -4978,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) int result; size_t len; char *str; - bool retried = false; if (clp->cl_owner_id != NULL) return 0; -retry: + rcu_read_lock(); - len = 10 + strlen(clp->cl_ipaddr) + 1 + + len = 14 + strlen(clp->cl_ipaddr) + 1 + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + 1 + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + @@ -5010,14 +5045,6 @@ retry: rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); rcu_read_unlock(); - /* Did something change? */ - if (result >= len) { - kfree(str); - if (retried) - return -EINVAL; - retried = true; - goto retry; - } clp->cl_owner_id = str; return 0; } @@ -5049,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, clp->cl_rpcclient->cl_nodename); - if (result >= len) { - kfree(str); - return -EINVAL; - } clp->cl_owner_id = str; return 0; } @@ -5088,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) result = scnprintf(str, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, clp->cl_rpcclient->cl_nodename); - if (result >= len) { - kfree(str); - return -EINVAL; - } clp->cl_owner_id = str; return 0; } @@ -5289,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (d_data->roc && - pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task)) - return; + if (d_data->roc) + pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, @@ -7746,10 +7764,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) case 0: goto out; /* + * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of + * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). + */ + case -NFS4ERR_BADLAYOUT: + goto out_overflow; + /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client - * (or clients) writing to the same RAID stripe + * (or clients) writing to the same RAID stripe except when + * the minlength argument is 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_LAYOUTTRYLATER: + if (lgp->args.minlength == 0) + goto out_overflow; /* * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall * existing layout before getting a new one). @@ -7805,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); out: dprintk("<-- %s\n", __func__); + return; +out_overflow: + task->tk_status = -EOVERFLOW; + goto out; } static size_t max_response_pages(struct nfs_server *server) @@ -8661,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, + .mig_recovery_ops = &nfs41_mig_recovery_ops, }; #endif diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f2e2ad8..da73bc4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); -static void nfs41_ping_server(struct nfs_client *clp) +void nfs41_notify_server(struct nfs_client *clp) { /* Use CHECK_LEASE to ping the server with a SEQUENCE */ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); nfs4_schedule_state_manager(clp); } -void nfs41_server_notify_target_slotid_update(struct nfs_client *clp) -{ - nfs41_ping_server(clp); -} - -void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp) -{ - nfs41_ping_server(clp); -} - static void nfs4_reset_all_state(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 470af1a..28df12e 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr); DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root); DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo); +DECLARE_EVENT_CLASS(nfs4_inode_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + int error + ), + + TP_ARGS(clp, fhandle, inode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, error)) +DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); +DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); + + DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( const char *name, @@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget, DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 558cd65d..788adf3 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int); #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ - 1 /* FIXME: opaque lrf_body always empty at the moment */) + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 1 + decode_stateid_maxsz) #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) @@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, - const struct nfs_server *server) + const struct nfs_server *server, + bool excl_check) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; } + + if (excl_check) { + const u32 *excl_bmval = server->exclcreat_bitmask; + bmval[0] &= excl_bmval[0]; + bmval[1] &= excl_bmval[1]; + bmval[2] &= excl_bmval[2]; + + if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) + label = NULL; + } + if (label) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; @@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * case NF4LNK: p = reserve_space(xdr, 4); *p = cpu_to_be32(create->u.symlink.len); - xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, + create->u.symlink.len); + xdr->buf->flags |= XDRBUF_WRITE; break; case NF4BLK: case NF4CHR: @@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server); + encode_attrs(xdr, create->attrs, create->label, create->server, false); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { - struct iattr dummy; __be32 *p; p = reserve_space(xdr, 4); switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - dummy.ia_valid = 0; - encode_attrs(xdr, &dummy, arg->label, arg->server); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true); } } @@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server); + encode_attrs(xdr, arg->iap, arg->label, server, false); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs4_server_caps_arg *args) { + const u32 *bitmask = args->bitmask; struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; @@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| - FATTR4_WORD0_FH_EXPIRE_TYPE| - FATTR4_WORD0_LINK_SUPPORT| - FATTR4_WORD0_SYMLINK_SUPPORT| - FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); encode_nops(&hdr); } @@ -3368,6 +3378,22 @@ out_overflow: return -EIO; } +static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, + uint32_t *bitmap, uint32_t *bitmask) +{ + if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) { + int ret; + ret = decode_attr_bitmap(xdr, bitmask); + if (unlikely(ret < 0)) + return ret; + bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT; + } else + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); + return 0; +} + static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; @@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_exclcreat_supported(xdr, bitmap, + res->exclcreat_bitmask)) != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr) } /* This is too sick! */ -static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) +static int decode_space_limit(struct xdr_stream *xdr, + unsigned long *pagemod_limit) { __be32 *p; uint32_t limit_type, nblocks, blocksize; + u64 maxsize = 0; p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; limit_type = be32_to_cpup(p++); switch (limit_type) { - case 1: - xdr_decode_hyper(p, maxsize); + case NFS4_LIMIT_SIZE: + xdr_decode_hyper(p, &maxsize); break; - case 2: + case NFS4_LIMIT_BLOCKS: nblocks = be32_to_cpup(p++); blocksize = be32_to_cpup(p); - *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } + maxsize >>= PAGE_CACHE_SHIFT; + *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, break; case NFS4_OPEN_DELEGATE_WRITE: res->delegation_type = FMODE_WRITE|FMODE_READ; - if (decode_space_limit(xdr, &res->maxsize) < 0) + if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 4984bbe..7c5718b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { spin_lock(&hdr->lock); - if (pos < hdr->io_start + hdr->good_bytes) { - set_bit(NFS_IOHDR_ERROR, &hdr->flags); + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) + || pos < hdr->io_start + hdr->good_bytes) { clear_bit(NFS_IOHDR_EOF, &hdr->flags); hdr->good_bytes = pos - hdr->io_start; hdr->error = error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 70bf706..ba12464 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; lo->plh_return_iomode = 0; - lo->plh_block_lgets++; pnfs_get_layout_hdr(lo); clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); return true; @@ -817,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); } -static bool -pnfs_layout_returning(const struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) -{ - return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && - (lo->plh_return_iomode == IOMODE_ANY || - lo->plh_return_iomode == range->iomode); -} - /* lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) { return lo->plh_block_lgets || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - (list_empty(&lo->plh_segs) && - (atomic_read(&lo->plh_outstanding) > lget)) || - pnfs_layout_returning(lo, range); + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } int @@ -847,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, range, 1)) { + if (pnfs_layoutgets_blocked(lo)) { status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; @@ -882,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg; + loff_t i_size; dprintk("--> %s\n", __func__); @@ -889,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo, if (lgp == NULL) return NULL; + i_size = i_size_read(ino); + lgp->args.minlength = PAGE_CACHE_SIZE; if (lgp->args.minlength > range->length) lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range = *range; lgp->args.type = server->pnfs_curr_ld->id; @@ -956,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); - lo->plh_block_lgets--; pnfs_clear_layoutreturn_waitbit(lo); - rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); spin_unlock(&ino->i_lock); pnfs_put_layout_hdr(lo); goto out; @@ -1080,15 +1073,14 @@ bool pnfs_roc(struct inode *ino) struct pnfs_layout_segment *lseg, *tmp; nfs4_stateid stateid; LIST_HEAD(tmp_list); - bool found = false, layoutreturn = false; + bool found = false, layoutreturn = false, roc = false; spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) goto out_noroc; - /* Don't return layout if we hold a delegation */ + /* no roc if we hold a delegation */ if (nfs4_check_delegation(ino, FMODE_READ)) goto out_noroc; @@ -1099,34 +1091,41 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } + stateid = lo->plh_stateid; + /* always send layoutreturn if being marked so */ + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { + /* If we are sending layoutreturn, invalidate all valid lsegs */ + if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { mark_lseg_invalid(lseg, &tmp_list); found = true; } - if (!found) - goto out_noroc; - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); - pnfs_layoutcommit_inode(ino, true); - return true; + /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put + * in pnfs_roc_release(). We don't really send a layoutreturn but + * still want others to view us like we are sending one! + * + * If pnfs_prepare_layoutreturn() fails, it means someone else is doing + * LAYOUTRETURN, so we proceed like there are no layouts to return. + * + * ROC in three conditions: + * 1. there are ROC lsegs + * 2. we don't send layoutreturn + * 3. no others are sending layoutreturn + */ + if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo)) + roc = true; out_noroc: - if (lo) { - stateid = lo->plh_stateid; - if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo); - } spin_unlock(&ino->i_lock); - if (layoutreturn) { - pnfs_layoutcommit_inode(ino, true); + pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); + if (layoutreturn) pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); - } - return false; + return roc; } void pnfs_roc_release(struct inode *ino) @@ -1135,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); if (atomic_dec_and_test(&lo->plh_refcount)) { pnfs_detach_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -1153,27 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); + trace_nfs4_layoutreturn_on_close(ino, 0); } -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg; - nfs4_stateid stateid; u32 current_seqid; - bool layoutreturn = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) { - if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) - continue; - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) - continue; - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - spin_unlock(&ino->i_lock); - return true; - } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -1181,19 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) * a barrier, we choose the worst-case barrier. */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); - stateid = lo->plh_stateid; - if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo); - if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - spin_unlock(&ino->i_lock); - if (layoutreturn) { - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); - return true; - } - return false; } /* @@ -1221,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); } -static void -pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg) +static bool +pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) { - struct pnfs_layout_segment *lp; + return pnfs_lseg_range_cmp(l1, l2) > 0; +} + +static bool +pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old) +{ + return false; +} + +void +pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *, + const struct pnfs_layout_range *), + bool (*do_merge)(struct pnfs_layout_segment *, + struct pnfs_layout_segment *), + struct list_head *free_me) +{ + struct pnfs_layout_segment *lp, *tmp; dprintk("%s:Begin\n", __func__); - list_for_each_entry(lp, &lo->plh_segs, pls_list) { - if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0) + list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0) + continue; + if (do_merge(lseg, lp)) { + mark_lseg_invalid(lp, free_me); + continue; + } + if (is_after(&lseg->pls_range, &lp->pls_range)) continue; list_add_tail(&lseg->pls_list, &lp->pls_list); dprintk("%s: inserted lseg %p " @@ -1252,6 +1253,24 @@ out: dprintk("%s:Return\n", __func__); } +EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg); + +static void +pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + struct inode *inode = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + + if (ld->add_lseg != NULL) + ld->add_lseg(lo, lseg, free_me); + else + pnfs_generic_layout_insert_lseg(lo, lseg, + pnfs_lseg_range_is_after, + pnfs_lseg_no_merge, + free_me); +} static struct pnfs_layout_hdr * alloc_init_layout_hdr(struct inode *ino, @@ -1344,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, ret = pnfs_get_lseg(lseg); break; } - if (lseg->pls_range.offset > range->offset) - break; } dprintk("%s:Return lseg %p ref %d\n", @@ -1438,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { + if (!pnfs_should_retry_layoutget(lo)) + return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference @@ -1484,6 +1503,9 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) goto out; + if (iomode == IOMODE_READ && i_size_read(ino) == 0) + goto out; + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) goto out; @@ -1533,8 +1555,7 @@ lookup_again: * Because we free lsegs before sending LAYOUTRETURN, we need to wait * for LAYOUTRETURN even if first is true. */ - if (!lseg && pnfs_should_retry_layoutget(lo) && - test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { spin_unlock(&ino->i_lock); dprintk("%s wait for layoutreturn\n", __func__); if (pnfs_prepare_to_retry_layoutget(lo)) { @@ -1547,7 +1568,7 @@ lookup_again: goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo, &arg, 0)) + if (pnfs_layoutgets_blocked(lo)) goto out_unlock; atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1593,6 +1614,26 @@ out_unlock: } EXPORT_SYMBOL_GPL(pnfs_update_layout); +static bool +pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) +{ + switch (range->iomode) { + case IOMODE_READ: + case IOMODE_RW: + break; + default: + return false; + } + if (range->offset == NFS4_MAX_UINT64) + return false; + if (range->length == 0) + return false; + if (range->length != NFS4_MAX_UINT64 && + range->length > NFS4_MAX_UINT64 - range->offset) + return false; + return true; +} + struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { @@ -1601,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = 0; + int status = -EINVAL; + + if (!pnfs_sanity_check_layout_range(&res->range)) + goto out; /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); @@ -1619,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) lseg->pls_range = res->range; spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - dprintk("%s forget reply due to recall\n", __func__); - goto out_forget_reply; - } - - if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) { + if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } @@ -1651,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); pnfs_get_lseg(lseg); - pnfs_layout_insert_lseg(lo, lseg); + pnfs_layout_insert_lseg(lo, lseg, &free_me); - if (res->return_on_close) { + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); - set_bit(NFS_LAYOUT_ROC, &lo->plh_flags); - } spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); @@ -1692,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); mark_lseg_invalid(lseg, tmp_list); + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); } } @@ -2267,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) #if IS_ENABLED(CONFIG_NFS_V4_2) int -pnfs_report_layoutstat(struct inode *inode) +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; struct nfs_server *server = NFS_SERVER(inode); @@ -2294,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode) pnfs_get_layout_hdr(hdr); spin_unlock(&inode->i_lock); - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), gfp_flags); if (!data) { status = -ENOMEM; goto out_put; @@ -2324,3 +2363,7 @@ out_put: } EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); #endif + +unsigned int layoutstats_timer; +module_param(layoutstats_timer, uint, 0644); +EXPORT_SYMBOL_GPL(layoutstats_timer); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 3e6ab7b..78c9351 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,7 +94,6 @@ enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ - NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ @@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); + void (*add_lseg) (struct pnfs_layout_hdr *layoutid, + struct pnfs_layout_segment *lseg, + struct list_head *free_me); void (*return_range) (struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range); @@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { atomic_t plh_refcount; + atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ - nfs4_stateid plh_stateid; - atomic_t plh_outstanding; /* number of RPCs out */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ - u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_retry_timestamp; unsigned long plh_flags; + nfs4_stateid plh_stateid; + u32 plh_barrier; /* ignore lower seqids */ enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); +void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); +void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, + bool (*is_after)(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *old), + bool (*do_merge)(struct pnfs_layout_segment *lseg, + struct pnfs_layout_segment *old), + struct list_head *free_me); + void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); @@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, nfss->pnfs_curr_ld->id == src->l_type); } +static inline u64 +pnfs_calc_offset_end(u64 offset, u64 len) +{ + if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset) + return NFS4_MAX_UINT64; + return offset + len - 1; +} + +static inline u64 +pnfs_calc_offset_length(u64 offset, u64 end) +{ + if (end == NFS4_MAX_UINT64 || end <= offset) + return NFS4_MAX_UINT64; + return 1 + end - offset; +} + +extern unsigned int layoutstats_timer; + #ifdef NFS_DEBUG void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); #else static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) { } + #endif /* NFS_DEBUG */ #else /* CONFIG_NFS_V4_1 */ @@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier) { } -static inline bool -pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) +static inline void +pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { - return false; } static inline void set_pnfs_layoutdriver(struct nfs_server *s, @@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) -int pnfs_report_layoutstat(struct inode *inode); +int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags); #else static inline int -pnfs_report_layoutstat(struct inode *inode) +pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) { return 0; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f37e25b..24655b8 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - bucket->clseg = bucket->wlseg; - if (list_empty(src)) + if (bucket->clseg == NULL) + bucket->clseg = pnfs_get_lseg(bucket->wlseg); + if (list_empty(src)) { + pnfs_put_lseg_locked(bucket->wlseg); bucket->wlseg = NULL; - else - pnfs_get_lseg(bucket->clseg); + } } return ret; } @@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; + LIST_HEAD(pages); int i; + spin_lock(cinfo->lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) continue; - nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i); - spin_lock(cinfo->lock); freeme = bucket->clseg; bucket->clseg = NULL; + list_splice_init(&bucket->committing, &pages); spin_unlock(cinfo->lock); + nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); + spin_lock(cinfo->lock); } + spin_unlock(cinfo->lock); } static unsigned int @@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, if (!data) break; data->ds_commit_index = i; - spin_lock(cinfo->lock); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(cinfo->lock); list_add(&data->pages, list); nreq++; } @@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, return nreq; } +static inline +void pnfs_fetch_commit_bucket_list(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *bucket; + + bucket = &cinfo->ds->buckets[data->ds_commit_index]; + spin_lock(cinfo->lock); + list_splice_init(&bucket->committing, pages); + data->lseg = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, if (!list_empty(mds_pages)) { data = nfs_commitdata_alloc(); if (data != NULL) { - data->lseg = NULL; + data->ds_commit_index = -1; list_add(&data->pages, &list); nreq++; } else { @@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); - if (!data->lseg) { + if (data->ds_commit_index < 0) { nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), data->mds_ops, how, 0); } else { - struct pnfs_commit_bucket *buckets; + LIST_HEAD(pages); - buckets = cinfo->ds->buckets; - nfs_init_commit(data, - &buckets[data->ds_commit_index].committing, - data->lseg, - cinfo); + pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } } @@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) return false; } +/* + * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does, + * declare a match. + */ static bool _same_data_server_addrs_locked(const struct list_head *dsaddrs1, const struct list_head *dsaddrs2) { struct nfs4_pnfs_ds_addr *da1, *da2; - - /* step through both lists, comparing as we go */ - for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), - da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); - da1 != NULL && da2 != NULL; - da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), - da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { - if (!same_sockaddr((struct sockaddr *)&da1->da_addr, - (struct sockaddr *)&da2->da_addr)) - return false; + struct sockaddr *sa1, *sa2; + bool match = false; + + list_for_each_entry(da1, dsaddrs1, da_node) { + sa1 = (struct sockaddr *)&da1->da_addr; + match = false; + list_for_each_entry(da2, dsaddrs2, da_node) { + sa2 = (struct sockaddr *)&da2->da_addr; + match = same_sockaddr(sa1, sa2); + if (match) + break; + } + if (!match) + break; } - if (da1 == NULL && da2 == NULL) - return true; - - return false; + return match; } /* @@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; - spin_unlock(cinfo->lock); - nfs_request_add_commit_list(req, list, cinfo); + nfs_request_add_commit_list_locked(req, list, cinfo); + spin_unlock(cinfo->lock); + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aa62004..383a027 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -381,9 +381,12 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - register_shrinker(&acl_shrinker); + ret = register_shrinker(&acl_shrinker); + if (ret < 0) + goto error_3; return 0; - +error_3: + nfs_unregister_sysctl(); error_2: unregister_nfs4_fs(); error_1: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 75a35a1..388f480 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -768,6 +768,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, } /** + * nfs_request_add_commit_list_locked - add request to a commit list + * @req: pointer to a struct nfs_page + * @dst: commit list head + * @cinfo: holds list lock and accounting info + * + * This sets the PG_CLEAN bit, updates the cinfo count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must hold the cinfo->lock, and the nfs_page lock. + */ +void +nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + set_bit(PG_CLEAN, &req->wb_flags); + nfs_list_add_request(req, dst); + cinfo->mds->ncommit++; +} +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); + +/** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page * @dst: commit list head @@ -784,13 +806,10 @@ void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo) { - set_bit(PG_CLEAN, &(req)->wb_flags); spin_lock(cinfo->lock); - nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + nfs_request_add_commit_list_locked(req, dst, cinfo); spin_unlock(cinfo->lock); - if (!cinfo->dreq) - nfs_mark_page_unstable(req->wb_page); + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -1793,7 +1812,7 @@ out_mark_dirty: return res; } -static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct nfs_inode *nfsi = NFS_I(inode); int flags = FLUSH_SYNC; @@ -1828,11 +1847,6 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } - -int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - return nfs_commit_unstable_pages(inode, wbc); -} EXPORT_SYMBOL_GPL(nfs_write_inode); /* diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 9aa2796..6d834dc 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, } nr_iomaps = be32_to_cpup(p++); - expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; if (len != expected) { dprintk("%s: extent array size mismatch: %u/%u\n", __func__, len, expected); diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index fdc7903..6de925f 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -7,13 +7,6 @@ struct iomap; struct xdr_stream; -enum pnfs_block_extent_state { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, - PNFS_BLOCK_NONE_DATA = 3, -}; - struct pnfs_block_extent { struct nfsd4_deviceid vol_id; u64 foff; @@ -21,14 +14,6 @@ struct pnfs_block_extent { u64 soff; enum pnfs_block_extent_state es; }; -#define NFS4_BLOCK_EXTENT_SIZE 44 - -enum pnfs_block_volume_type { - PNFS_BLOCK_VOLUME_SIMPLE = 0, - PNFS_BLOCK_VOLUME_SLICE = 1, - PNFS_BLOCK_VOLUME_CONCAT = 2, - PNFS_BLOCK_VOLUME_STRIPE = 3, -}; /* * Random upper cap for the uuid length to avoid unbounded allocation. diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b8e72aa..00121f2 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -547,6 +547,24 @@ enum pnfs_notify_deviceid_type4 { NOTIFY_DEVICEID4_DELETE = 1 << 2, }; +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, +}; + +enum pnfs_block_extent_state { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, + PNFS_BLOCK_NONE_DATA = 3, +}; + +/* on the wire size of a block layout extent */ +#define PNFS_BLOCK_EXTENT_SIZE \ + (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) + #define NFL4_UFLG_MASK 0x0000003F #define NFL4_UFLG_DENSE 0x00000001 #define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 874b772..c0e9614 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -353,7 +353,6 @@ extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); -extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); @@ -371,6 +370,7 @@ extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struc extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode); extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx); extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); +extern void nfs_file_clear_open_context(struct file *flip); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 20bc8e5..570a7df 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -173,6 +173,11 @@ struct nfs_server { set of attributes supported on this filesystem excluding the label support bit. */ + u32 exclcreat_bitmask[3]; + /* V4 bitmask representing the + set of attributes supported + on this filesystem for the + exclusive create. */ u32 cache_consistency_bitmask[3]; /* V4 bitmask representing the subset of change attribute, size, ctime diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7bbe505..52faf7e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -379,7 +379,7 @@ struct nfs_openargs { struct stateowner_id id; union { struct { - struct iattr * attrs; /* UNCHECKED, GUARDED */ + struct iattr * attrs; /* UNCHECKED, GUARDED, EXCLUSIVE4_1 */ nfs4_verifier verifier; /* EXCLUSIVE */ }; nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ @@ -389,7 +389,7 @@ struct nfs_openargs { const struct nfs_server *server; /* Needed for ID mapping */ const u32 * bitmask; const u32 * open_bitmap; - __u32 claim; + enum open_claim_type4 claim; enum createmode4 createmode; const struct nfs4_label *label; }; @@ -406,8 +406,8 @@ struct nfs_openres { const struct nfs_server *server; fmode_t delegation_type; nfs4_stateid delegation; + unsigned long pagemod_limit; __u32 do_recall; - __u64 maxsize; __u32 attrset[NFS4_BITMAP_SIZE]; struct nfs4_string *owner; struct nfs4_string *group_owner; @@ -1057,11 +1057,13 @@ struct nfs4_statfs_res { struct nfs4_server_caps_arg { struct nfs4_sequence_args seq_args; struct nfs_fh *fhandle; + const u32 * bitmask; }; struct nfs4_server_caps_res { struct nfs4_sequence_res seq_res; u32 attr_bitmask[3]; + u32 exclcreat_bitmask[3]; u32 acl_bitmask; u32 has_links; u32 has_symlinks; diff --git a/include/linux/sunrpc/addr.h b/include/linux/sunrpc/addr.h index 07d8e53..5c9c6cd 100644 --- a/include/linux/sunrpc/addr.h +++ b/include/linux/sunrpc/addr.h @@ -46,8 +46,8 @@ static inline void rpc_set_port(struct sockaddr *sap, #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") -static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1, - const struct sockaddr *sap2) +static inline bool rpc_cmp_addr4(const struct sockaddr *sap1, + const struct sockaddr *sap2) { const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; @@ -67,8 +67,8 @@ static inline bool __rpc_copy_addr4(struct sockaddr *dst, } #if IS_ENABLED(CONFIG_IPV6) -static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, - const struct sockaddr *sap2) +static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, + const struct sockaddr *sap2) { const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; @@ -93,7 +93,7 @@ static inline bool __rpc_copy_addr6(struct sockaddr *dst, return true; } #else /* !(IS_ENABLED(CONFIG_IPV6) */ -static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1, +static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { return false; @@ -122,15 +122,28 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1, if (sap1->sa_family == sap2->sa_family) { switch (sap1->sa_family) { case AF_INET: - return __rpc_cmp_addr4(sap1, sap2); + return rpc_cmp_addr4(sap1, sap2); case AF_INET6: - return __rpc_cmp_addr6(sap1, sap2); + return rpc_cmp_addr6(sap1, sap2); } } return false; } /** + * rpc_cmp_addr_port - compare the address and port number of two sockaddrs. + * @sap1: first sockaddr + * @sap2: second sockaddr + */ +static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1, + const struct sockaddr *sap2) +{ + if (!rpc_cmp_addr(sap1, sap2)) + return false; + return rpc_get_port(sap1) == rpc_get_port(sap2); +} + +/** * rpc_copy_addr - copy the address portion of one sockaddr to another * @dst: destination sockaddr * @src: source sockaddr diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index a7cbb57..1ecf13e 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -18,9 +18,13 @@ #include <linux/atomic.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> +#include <linux/utsname.h> -/* size of the nodename buffer */ -#define UNX_MAXNODENAME 32 +/* + * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes, + * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes. + */ +#define UNX_MAXNODENAME __NEW_UTS_LEN struct rpcsec_gss_info; diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index b176130..b7b279b 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -49,7 +49,7 @@ * a single chunk type per message is supported currently. */ #define RPCRDMA_MIN_SLOT_TABLE (2U) -#define RPCRDMA_DEF_SLOT_TABLE (32U) +#define RPCRDMA_DEF_SLOT_TABLE (128U) #define RPCRDMA_MAX_SLOT_TABLE (256U) #define RPCRDMA_DEF_INLINE (1024) /* default inline max */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b0f898e..43c1cf0 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2760,52 +2760,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, } /** - * ib_reg_phys_mr - Prepares a virtually addressed memory region for use - * by an HCA. - * @pd: The protection domain associated assigned to the registered region. - * @phys_buf_array: Specifies a list of physical buffers to use in the - * memory region. - * @num_phys_buf: Specifies the size of the phys_buf_array. - * @mr_access_flags: Specifies the memory access rights. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** - * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. - * Conceptually, this call performs the functions deregister memory region - * followed by register physical memory region. Where possible, - * resources are reused instead of deallocated and reallocated. - * @mr: The memory region to modify. - * @mr_rereg_mask: A bit-mask used to indicate which of the following - * properties of the memory region are being modified. - * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies - * the new protection domain to associated with the memory region, - * otherwise, this parameter is ignored. - * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies a list of physical buffers to use in the new - * translation, otherwise, this parameter is ignored. - * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies the size of the phys_buf_array, otherwise, this - * parameter is ignored. - * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this - * field specifies the new memory access rights, otherwise, this - * parameter is ignored. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** * ib_query_mr - Retrieves information about a specific memory region. * @mr: The memory region to retrieve information about. * @mr_attr: The attributes of the specified memory region. diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 2119c7c..2b871e0 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -15,7 +15,7 @@ #include <linux/types.h> -#define NFS4_BITMAP_SIZE 2 +#define NFS4_BITMAP_SIZE 3 #define NFS4_VERIFIER_SIZE 8 #define NFS4_STATEID_SEQID_SIZE 4 #define NFS4_STATEID_OTHER_SIZE 12 diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4feda2d..548240d 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -23,7 +23,7 @@ struct unx_cred { }; #define uc_uid uc_base.cr_uid -#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) +#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1e8daf..cb25c89 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -39,6 +39,25 @@ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + struct ib_device_attr *devattr = &ia->ri_devattr; + struct ib_mr *mr; + + /* Obtain an lkey to use for the regbufs, which are + * protected from remote access. + */ + if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; + } else { + mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(mr)) { + pr_err("%s: ib_get_dma_mr for failed with %lX\n", + __func__, PTR_ERR(mr)); + return -ENOMEM; + } + ia->ri_dma_lkey = ia->ri_dma_mr->lkey; + ia->ri_dma_mr = mr; + } + return 0; } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 04ea914..63f282e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -189,6 +189,11 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct ib_device_attr *devattr = &ia->ri_devattr; int depth, delta; + /* Obtain an lkey to use for the regbufs, which are + * protected from remote access. + */ + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; + ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, devattr->max_fast_reg_page_list_len); diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 41985d0..72cf8b1 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -23,6 +23,29 @@ static int physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + struct ib_device_attr *devattr = &ia->ri_devattr; + struct ib_mr *mr; + + /* Obtain an rkey to use for RPC data payloads. + */ + mr = ib_get_dma_mr(ia->ri_pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(mr)) { + pr_err("%s: ib_get_dma_mr for failed with %lX\n", + __func__, PTR_ERR(mr)); + return -ENOMEM; + } + ia->ri_dma_mr = mr; + + /* Obtain an lkey to use for regbufs. + */ + if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; + else + ia->ri_dma_lkey = ia->ri_dma_mr->lkey; + return 0; } @@ -51,7 +74,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_ia *ia = &r_xprt->rx_ia; rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); - seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_rkey = ia->ri_dma_mr->rkey; seg->mr_base = seg->mr_dma; seg->mr_nsegs = 1; return 1; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 84ea37d..bc8bd65 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,6 +71,67 @@ static const char transfertypes[][12] = { }; #endif +/* The client can send a request inline as long as the RPCRDMA header + * plus the RPC call fit under the transport's inline limit. If the + * combined call message size exceeds that limit, the client must use + * the read chunk list for this operation. + */ +static bool rpcrdma_args_inline(struct rpc_rqst *rqst) +{ + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; + + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); +} + +/* The client can't know how large the actual reply will be. Thus it + * plans for the largest possible reply for that particular ULP + * operation. If the maximum combined reply message size exceeds that + * limit, the client must provide a write list or a reply chunk for + * this request. + */ +static bool rpcrdma_results_inline(struct rpc_rqst *rqst) +{ + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; + + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); +} + +static int +rpcrdma_tail_pullup(struct xdr_buf *buf) +{ + size_t tlen = buf->tail[0].iov_len; + size_t skip = tlen & 3; + + /* Do not include the tail if it is only an XDR pad */ + if (tlen < 4) + return 0; + + /* xdr_write_pages() adds a pad at the beginning of the tail + * if the content in "buf->pages" is unaligned. Force the + * tail's actual content to land at the next XDR position + * after the head instead. + */ + if (skip) { + unsigned char *src, *dst; + unsigned int count; + + src = buf->tail[0].iov_base; + dst = buf->head[0].iov_base; + dst += buf->head[0].iov_len; + + src += skip; + tlen -= skip; + + dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n", + __func__, skip, dst, src, tlen); + + for (count = tlen; count; count--) + *dst++ = *src++; + } + + return tlen; +} + /* * Chunk assembly from upper layer xdr_buf. * @@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == nsegs) return -EIO; + /* When encoding the read list, the tail is always sent inline */ + if (type == rpcrdma_readch) + return n; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ @@ -297,8 +362,7 @@ out: * pre-registered memory buffer for this request. For small amounts * of data, this is efficient. The cutoff value is tunable. */ -static int -rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) { int i, npages, curlen; int copy_len; @@ -310,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp = rqst->rq_svec[0].iov_base; curlen = rqst->rq_svec[0].iov_len; destp += curlen; - /* - * Do optional padding where it makes sense. Alignment of write - * payload can help the server, if our setting is accurate. - */ - pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); - if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) - pad = 0; /* don't pad this request */ - dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", - __func__, pad, destp, rqst->rq_slen, curlen); + dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", + __func__, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; @@ -355,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) page_base = 0; } /* header now contains entire send message */ - return pad; } /* @@ -380,7 +436,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t rpclen, padlen; + size_t rpclen; ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; @@ -402,28 +458,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* * Chunks needed for results? * + * o Read ops return data as write chunk(s), header as inline. * o If the expected result is under the inline threshold, all ops - * return as inline (but see later). + * return as inline. * o Large non-read ops return as a single reply chunk. - * o Large read ops return data as write chunk(s), header as inline. - * - * Note: the NFS code sending down multiple result segments implies - * the op is one of read, readdir[plus], readlink or NFSv4 getacl. - */ - - /* - * This code can handle read chunks, write chunks OR reply - * chunks -- only one type. If the request is too big to fit - * inline, then we will choose read chunks. If the request is - * a READ, then use write chunks to separate the file data - * into pages; otherwise use reply chunks. */ - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + if (rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; + else if (rpcrdma_results_inline(rqst)) + wtype = rpcrdma_noch; else wtype = rpcrdma_replych; @@ -432,21 +475,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * * o If the total request is under the inline threshold, all ops * are sent as inline. - * o Large non-write ops are sent with the entire message as a - * single read chunk (protocol 0-position special case). * o Large write ops transmit data as read chunk(s), header as * inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). * - * Note: the NFS code sending down multiple argument segments - * implies the op is a write. - * TBD check NFSv4 setacl + * This assumes that the upper layer does not present a request + * that both has a data payload, and whose non-data arguments + * by themselves are larger than the inline threshold. */ - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + if (rpcrdma_args_inline(rqst)) { rtype = rpcrdma_noch; - else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; - else + } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; + } else { + r_xprt->rx_stats.nomsg_call_count++; + headerp->rm_type = htonl(RDMA_NOMSG); + rtype = rpcrdma_areadch; + rpclen = 0; + } /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) @@ -458,7 +505,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) } hdrlen = RPCRDMA_HDRLEN_MIN; - padlen = 0; /* * Pull up any extra send data into the preregistered buffer. @@ -467,45 +513,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rtype == rpcrdma_noch) { - padlen = rpcrdma_inline_pullup(rqst, - RPCRDMA_INLINE_PAD_VALUE(rqst)); - - if (padlen) { - headerp->rm_type = rdma_msgp; - headerp->rm_body.rm_padded.rm_align = - cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); - headerp->rm_body.rm_padded.rm_thresh = - cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); - headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; - hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { - dprintk("RPC: %s: invalid chunk list\n", - __func__); - return -EIO; - } - } else { - headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; - /* new length after pullup */ - rpclen = rqst->rq_svec[0].iov_len; - /* - * Currently we try to not actually use read inline. - * Reply chunks have the desirable property that - * they land, packed, directly in the target buffers - * without headers, so they require no fixup. The - * additional RDMA Write op sends the same amount - * of data, streams on-the-wire and adds no overhead - * on receive. Therefore, we request a reply chunk - * for non-writes wherever feasible and efficient. - */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; - } - } + rpcrdma_inline_pullup(rqst); + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + } else if (rtype == rpcrdma_readch) + rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); if (rtype != rpcrdma_noch) { hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, headerp, rtype); @@ -518,9 +534,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen < 0) return hdrlen; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* @@ -534,26 +550,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + req->rl_niovs = 1; + if (rtype == rpcrdma_areadch) + return 0; + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 2; - - if (padlen) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf); - req->rl_send_iov[2].length = padlen; - req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf); - - req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; - req->rl_send_iov[3].length = rqst->rq_slen - rpclen; - req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 4; - } - return 0; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 680f888..64443eb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) } static void -xprt_rdma_format_addresses(struct rpc_xprt *xprt) +xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { - struct sockaddr *sap = (struct sockaddr *) - &rpcx_to_rdmad(xprt).addr; char buf[128]; switch (sap->sa_family) { @@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; struct rpcrdma_ep *new_ep; - struct sockaddr_in *sin; + struct sockaddr *sap; int rc; if (args->addrlen > sizeof(xprt->addr)) { @@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args) * Set up RDMA-specific connect data. */ - /* Put server RDMA address in local cdata */ - memcpy(&cdata.addr, args->dstaddr, args->addrlen); + sap = (struct sockaddr *)&cdata.addr; + memcpy(sap, args->dstaddr, args->addrlen); /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; xprt->addrlen = args->addrlen; - memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + memcpy(&xprt->addr, sap, xprt->addrlen); - sin = (struct sockaddr_in *)&cdata.addr; - if (ntohs(sin->sin_port) != 0) + if (rpc_get_port(sap)) xprt_set_bound(xprt); - dprintk("RPC: %s: %pI4:%u\n", - __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); - - /* Set max requests */ cdata.max_requests = xprt->max_reqs; - /* Set some length limits */ cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ @@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, - xprt_rdma_memreg_strategy); + rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy); if (rc) goto out1; @@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt); + xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args) if (!try_module_get(THIS_MODULE)) goto out4; + dprintk("RPC: %s: %s:%s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); return xprt; out4: @@ -653,31 +647,30 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; - seq_printf(seq, - "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " - "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", - - 0, /* need a local port? */ - xprt->stat.bind_count, - xprt->stat.connect_count, - xprt->stat.connect_time, - idle_time, - xprt->stat.sends, - xprt->stat.recvs, - xprt->stat.bad_xids, - xprt->stat.req_u, - xprt->stat.bklog_u, - - r_xprt->rx_stats.read_chunk_count, - r_xprt->rx_stats.write_chunk_count, - r_xprt->rx_stats.reply_chunk_count, - r_xprt->rx_stats.total_rdma_request, - r_xprt->rx_stats.total_rdma_reply, - r_xprt->rx_stats.pullup_copy_count, - r_xprt->rx_stats.fixup_copy_count, - r_xprt->rx_stats.hardway_register_count, - r_xprt->rx_stats.failed_marshal_count, - r_xprt->rx_stats.bad_reply_count); + seq_puts(seq, "\txprt:\trdma "); + seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ", + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count, + r_xprt->rx_stats.nomsg_call_count); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 891c4ed..f73d7a7 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -52,6 +52,7 @@ #include <linux/prefetch.h> #include <linux/sunrpc/addr.h> #include <asm/bitops.h> +#include <linux/module.h> /* try_module_get()/module_put() */ #include "xprt_rdma.h" @@ -414,6 +415,14 @@ connected: return 0; } +static void rpcrdma_destroy_id(struct rdma_cm_id *id) +{ + if (id) { + module_put(id->device->owner); + rdma_destroy_id(id); + } +} + static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -440,6 +449,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + + /* FIXME: + * Until xprtrdma supports DEVICE_REMOVAL, the provider must + * be pinned while there are active NFS/RDMA mounts to prevent + * hangs and crashes at umount time. + */ + if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { + dprintk("RPC: %s: Failed to get device module\n", + __func__); + ia->ri_async_rc = -ENODEV; + } rc = ia->ri_async_rc; if (rc) goto out; @@ -449,16 +469,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto out; + goto put; } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) - goto out; + goto put; return id; - +put: + module_put(id->device->owner); out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -493,9 +514,11 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc, mem_priv; struct rpcrdma_ia *ia = &xprt->rx_ia; struct ib_device_attr *devattr = &ia->ri_devattr; + int rc; + + ia->ri_dma_mr = NULL; ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { @@ -519,11 +542,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out3; } - if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { - ia->ri_have_dma_lkey = 1; - ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; - } - if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ if (((devattr->device_cap_flags & @@ -539,42 +557,19 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); - memreg = RPCRDMA_ALLPHYSICAL; + goto out3; } } - /* - * Optionally obtain an underlying physical identity mapping in - * order to do a memory window-based bind. This base registration - * is protected from remote access - that is enabled only by binding - * for the specific bytes targeted during each RPC operation, and - * revoked after the corresponding completion similar to a storage - * adapter. - */ switch (memreg) { case RPCRDMA_FRMR: ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: ia->ri_ops = &rpcrdma_physical_memreg_ops; - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ; - goto register_setup; + break; case RPCRDMA_MTHCAFMR: ia->ri_ops = &rpcrdma_fmr_memreg_ops; - if (ia->ri_have_dma_lkey) - break; - mem_priv = IB_ACCESS_LOCAL_WRITE; - register_setup: - ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); - if (IS_ERR(ia->ri_bind_mem)) { - printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n", - __func__, PTR_ERR(ia->ri_bind_mem)); - rc = -ENOMEM; - goto out3; - } break; default: printk(KERN_ERR "RPC: Unsupported memory " @@ -592,7 +587,7 @@ out3: ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; out2: - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; out1: return rc; @@ -606,19 +601,11 @@ out1: void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering\n", __func__); - if (ia->ri_bind_mem != NULL) { - rc = ib_dereg_mr(ia->ri_bind_mem); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; } @@ -639,6 +626,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct ib_cq_init_attr cq_attr = {}; int rc, err; + if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + dprintk("RPC: %s: insufficient sge's available\n", + __func__); + return -ENOMEM; + } + /* check provider's send/recv wr limits */ if (cdata->max_requests > devattr->max_qp_wr) cdata->max_requests = devattr->max_qp_wr; @@ -651,21 +644,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ep->rep_attr.qp_type = IB_QPT_RC; ep->rep_attr.port_num = ~0; - if (cdata->padding) { - ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding, - GFP_KERNEL); - if (IS_ERR(ep->rep_padbuf)) - return PTR_ERR(ep->rep_padbuf); - } else - ep->rep_padbuf = NULL; - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, @@ -748,7 +733,8 @@ out2: dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); out1: - rpcrdma_free_regbuf(ia, ep->rep_padbuf); + if (ia->ri_dma_mr) + ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -775,8 +761,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } - rpcrdma_free_regbuf(ia, ep->rep_padbuf); - rpcrdma_clean_cq(ep->rep_attr.recv_cq); rc = ib_destroy_cq(ep->rep_attr.recv_cq); if (rc) @@ -788,6 +772,12 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); + + if (ia->ri_dma_mr) { + rc = ib_dereg_mr(ia->ri_dma_mr); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } } /* @@ -825,7 +815,7 @@ retry: if (ia->ri_device != id->device) { printk("RPC: %s: can't reconnect on " "different device!\n", __func__); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -834,7 +824,7 @@ retry: if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -845,7 +835,7 @@ retry: write_unlock(&ia->ri_qplock); rdma_destroy_qp(old); - rdma_destroy_id(old); + rpcrdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -1229,75 +1219,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) (unsigned long long)seg->mr_dma, seg->mr_dmalen); } -static int -rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, - struct ib_mr **mrp, struct ib_sge *iov) -{ - struct ib_phys_buf ipb; - struct ib_mr *mr; - int rc; - - /* - * All memory passed here was kmalloc'ed, therefore phys-contiguous. - */ - iov->addr = ib_dma_map_single(ia->ri_device, - va, len, DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_device, iov->addr)) - return -ENOMEM; - - iov->length = len; - - if (ia->ri_have_dma_lkey) { - *mrp = NULL; - iov->lkey = ia->ri_dma_lkey; - return 0; - } else if (ia->ri_bind_mem != NULL) { - *mrp = NULL; - iov->lkey = ia->ri_bind_mem->lkey; - return 0; - } - - ipb.addr = iov->addr; - ipb.size = iov->length; - mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, - IB_ACCESS_LOCAL_WRITE, &iov->addr); - - dprintk("RPC: %s: phys convert: 0x%llx " - "registered 0x%llx length %d\n", - __func__, (unsigned long long)ipb.addr, - (unsigned long long)iov->addr, len); - - if (IS_ERR(mr)) { - *mrp = NULL; - rc = PTR_ERR(mr); - dprintk("RPC: %s: failed with %i\n", __func__, rc); - } else { - *mrp = mr; - iov->lkey = mr->lkey; - rc = 0; - } - - return rc; -} - -static int -rpcrdma_deregister_internal(struct rpcrdma_ia *ia, - struct ib_mr *mr, struct ib_sge *iov) -{ - int rc; - - ib_dma_unmap_single(ia->ri_device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); - - if (NULL == mr) - return 0; - - rc = ib_dereg_mr(mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); - return rc; -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1317,26 +1238,29 @@ struct rpcrdma_regbuf * rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) { struct rpcrdma_regbuf *rb; - int rc; + struct ib_sge *iov; - rc = -ENOMEM; rb = kmalloc(sizeof(*rb) + size, flags); if (rb == NULL) goto out; - rb->rg_size = size; - rb->rg_owner = NULL; - rc = rpcrdma_register_internal(ia, rb->rg_base, size, - &rb->rg_mr, &rb->rg_iov); - if (rc) + iov = &rb->rg_iov; + iov->addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, size, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_device, iov->addr)) goto out_free; + iov->length = size; + iov->lkey = ia->ri_dma_lkey; + rb->rg_size = size; + rb->rg_owner = NULL; return rb; out_free: kfree(rb); out: - return ERR_PTR(rc); + return ERR_PTR(-ENOMEM); } /** @@ -1347,10 +1271,15 @@ out: void rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { - if (rb) { - rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov); - kfree(rb); - } + struct ib_sge *iov; + + if (!rb) + return; + + iov = &rb->rg_iov; + ib_dma_unmap_single(ia->ri_device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + kfree(rb); } /* @@ -1363,9 +1292,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { + struct ib_device *device = ia->ri_device; struct ib_send_wr send_wr, *send_wr_fail; struct rpcrdma_rep *rep = req->rl_reply; - int rc; + struct ib_sge *iov = req->rl_send_iov; + int i, rc; if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); @@ -1376,22 +1307,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr.next = NULL; send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; - send_wr.sg_list = req->rl_send_iov; + send_wr.sg_list = iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[3].addr, - req->rl_send_iov[3].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[1].addr, - req->rl_send_iov[1].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[0].addr, - req->rl_send_iov[0].length, - DMA_TO_DEVICE); + + for (i = 0; i < send_wr.num_sge; i++) + ib_dma_sync_single_for_device(device, iov[i].addr, + iov[i].length, DMA_TO_DEVICE); + dprintk("RPC: %s: posting %d s/g entries\n", + __func__, send_wr.num_sge); if (DECR_CQCOUNT(ep) > 0) send_wr.send_flags = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e718d09..0251222 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -64,9 +64,8 @@ struct rpcrdma_ia { struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_bind_mem; + struct ib_mr *ri_dma_mr; u32 ri_dma_lkey; - int ri_have_dma_lkey; struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; @@ -88,7 +87,6 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; - struct rpcrdma_regbuf *rep_padbuf; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; @@ -118,7 +116,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; struct rpcrdma_req *rg_owner; - struct ib_mr *rg_mr; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -164,8 +161,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * struct rpcrdma_buffer. N is the max number of outstanding requests. */ -/* temporary static scatter/gather max */ -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ struct rpcrdma_buffer; @@ -257,16 +253,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; +#define RPCRDMA_MAX_IOVS (2) + struct rpcrdma_req { - unsigned int rl_niovs; /* 0, 2 or 4 */ - unsigned int rl_nchunks; /* non-zero if chunks */ - unsigned int rl_connect_cookie; /* retry detection */ - struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + unsigned int rl_niovs; + unsigned int rl_nchunks; + unsigned int rl_connect_cookie; + struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[4]; /* for active requests */ - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; + struct rpcrdma_regbuf *rl_rdmabuf; + struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -341,6 +339,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; + unsigned long nomsg_call_count; }; /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0030376..7be90bc 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -822,6 +822,8 @@ static void xs_reset_transport(struct sock_xprt *transport) if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); + kernel_sock_shutdown(sock, SHUT_RDWR); + write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; @@ -829,6 +831,7 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_user_data = NULL; xs_restore_old_callbacks(transport, sk); + xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); @@ -1866,7 +1869,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_clear_connected(xprt); @@ -2051,7 +2054,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_set_connected(xprt); @@ -2153,7 +2156,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); @@ -2279,13 +2282,14 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); - /* Start by resetting any existing state */ - xs_reset_transport(transport); - - if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); + + /* Start by resetting any existing state */ + xs_reset_transport(transport); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, xprt->reestablish_timeout); |